0ddccc60e vs cfa1a2c6b - NVFuser codegen diff
0ddccc60e
0ddccc60e change to cudaDeviceGetAttribute for clock & memory rate (#4241)
[browse]
Liqiang Lu <116412316+liqiangxl@users.noreply.github.com>
Fri Apr 11 15:04:06 2025 -0400
cfa1a2c6b
cfa1a2c6b temp
[browse]
Naoya Maruyama <nmaruyama@nvidia.com>
Fri Apr 11 16:15:08 2025 -0700
Command: build/test_nvfuser --gtest_filter=CombinedSchedulerTest.*
GPUs:
['NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n', 'NVIDIA H100 80GB HBM3\n']
nvcc --version
matches between runs
> nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0
Env
matches between runs
BIBINPUTS=.//:/home/nmaruyama/tex/bib//:
BSTINPUTS=.//:/home/nmaruyama/tex/bst//:
CLASSPATH=.
CONDA_DEFAULT_ENV=/opt/conda/pytorch
CONDA_EXE=/opt/conda/miniconda/bin/conda
CONDA_PREFIX=/opt/conda/pytorch
CONDA_PROMPT_MODIFIER=(/opt/conda/pytorch)
CONDA_PYTHON_EXE=/opt/conda/miniconda/bin/python
CONDA_SHLVL=1
CUDA_CACHE_MAX_SIZE=4294967296
CVSROOT=smghome:/home/naoya/cvs
CVS_RSH=ssh
DEBUG_SERDE=disable
ECLIPSE_HOME=/home/nmaruyama/apps/eclipse
EDITOR=vim
GREP_COLOR=33
GTK_IM_MODULE=scim
HOME=/home/nmaruyama
HOSTNAME=debug6.cuda128_clang19_ubuntu24.04_python3.10_pyt
I_MPI_CC=icc
I_MPI_CXX=icpc
I_MPI_F77=ifort
I_MPI_F90=ifort
LANG=C
LC_ALL=en_US.UTF-8
LC_TERMINAL=iTerm2
LC_TERMINAL_VERSION=3.5.11
LD_LIBRARY_PATH=/usr/lib64
LESSGLOBALTAGS=global
LOGNAME=nmaruyama
MOZILLA_FIVE_HOME=/usr/lib/mozilla
NVFUSER_DISABLE=parallel_compile
NVFUSER_DUMP=cuda_to_file,ptxas_verbose,ptx
NVFUSER_ENABLE=static_fusion_count
NVFUSER_TEST_RANDOM_SEED=0
OLDPWD=/home/nmaruyama/scratch/nvfuser
PATH=/opt/conda/pytorch/bin:/opt/conda/miniconda/condabin:/home/nmaruyama/perl5/bin:/home/nmaruyama/miniconda3/bin:/home/nmaruyama/scratch/nvfuser/utils:/home/nmaruyama/bin:/home/nmaruyama/apps/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin:/home/nmaruyama/projects/tools/tau/x86_64/bin:/home/nmaruyama/projects/tools/pdtoolkit/x86_64/bin:/home/nmaruyama/projects/tools/papi/bin:/home/nmaruyama/projects/tools/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/nmaruyama/apps/eclipse
PERL5LIB=/home/nmaruyama/perl5/lib/perl5
PERL_LOCAL_LIB_ROOT=/home/nmaruyama/perl5
PERL_MB_OPT=--install_base "/home/nmaruyama/perl5"
PERL_MM_OPT=INSTALL_BASE=/home/nmaruyama/perl5
PROMPT=%m:%1~$
PWD=/home/nmaruyama/scratch/nvfuser/debug6
QT_IM_MODULE=scim
R_LIBS_USER=/home/nmaruyama/apps/r-lib
SHELL=/bin/zsh
SHLVL=3
SSH_CLIENT=192.168.128.1 36170 22
SSH_CONNECTION=192.168.128.1 36170 192.168.128.5 22
SSH_TTY=/dev/pts/0
SVN_EDITOR=vim
TERM=xterm-256color
TEXINPUTS=.//:/home/nmaruyama/tex/sty//:
TZ=America/Los_Angeles
USER=nmaruyama
_=/usr/bin/printenv
WORDCHARS=*?[]~=&;!#$%^(){}
NVFuser preamble
matches between runs
#ifdef __NVCC__
#include <complex>
#endif // __NVCC__
namespace {
using int8_t = signed char;
using uint8_t = unsigned char;
using int16_t = short int;
using uint16_t = unsigned short int;
using int32_t = int;
using uint32_t = unsigned int;
using int64_t = long long int;
using uint64_t = unsigned long long int;
// Modified from cuda.h
struct TensorMap {
alignas(64)
uint64_t opaque[16];
};
typedef int nvfuser_index_t; // NOTE: index type hard-coded as int for display only
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifdef __NVCC__
#include <type_traits>
#else
// The following namespace std is modified from LLVM, see the following
// copyright information
//
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// copy-pasted from some llvm files:
// - https://github.com/llvm/llvm-project/blob/main/libcxx/include/type_traits
// -
// https://github.com/llvm/llvm-project/blob/main/clang/test/Headers/Inputs/include/type_traits
namespace std {
template <class _Tp>
_Tp&& __declval(int);
template <class _Tp>
_Tp __declval(long);
template <class _Tp>
decltype(__declval<_Tp>(0)) declval() noexcept;
template <class _Tp, _Tp __v>
struct integral_constant {
static const _Tp value = __v;
typedef _Tp value_type;
typedef integral_constant type;
};
typedef integral_constant<bool, true> true_type;
typedef integral_constant<bool, false> false_type;
// is_same, functional
template <class _Tp, class _Up>
struct is_same : public false_type {};
template <class _Tp>
struct is_same<_Tp, _Tp> : public true_type {};
template <class T, class U>
constexpr bool is_same_v = is_same<T, U>::value;
// is_integral, for some types.
template <class _Tp>
struct is_integral : public integral_constant<bool, false> {};
template <>
struct is_integral<bool> : public integral_constant<bool, true> {};
template <>
struct is_integral<char> : public integral_constant<bool, true> {};
template <>
struct is_integral<short> : public integral_constant<bool, true> {};
template <>
struct is_integral<int> : public integral_constant<bool, true> {};
template <>
struct is_integral<long> : public integral_constant<bool, true> {};
template <>
struct is_integral<long long> : public integral_constant<bool, true> {};
// enable_if, functional
template <bool _C, typename _Tp>
struct enable_if {};
template <typename _Tp>
struct enable_if<true, _Tp> {
using type = _Tp;
};
template <bool b, class T = void>
using enable_if_t = typename enable_if<b, T>::type;
template <class _Tp>
struct remove_const {
typedef _Tp type;
};
template <class _Tp>
struct remove_const<const _Tp> {
typedef _Tp type;
};
template <class _Tp>
using remove_const_t = typename remove_const<_Tp>::type;
template <class _Tp>
struct remove_volatile {
typedef _Tp type;
};
template <class _Tp>
struct remove_volatile<volatile _Tp> {
typedef _Tp type;
};
template <class _Tp>
using remove_volatile_t = typename remove_volatile<_Tp>::type;
template <class _Tp>
struct remove_cv {
typedef typename remove_volatile<typename remove_const<_Tp>::type>::type type;
};
template <class _Tp>
using remove_cv_t = typename remove_cv<_Tp>::type;
template <class _Tp>
struct __libcpp_is_floating_point : public false_type {};
template <>
struct __libcpp_is_floating_point<float> : public true_type {};
template <>
struct __libcpp_is_floating_point<double> : public true_type {};
template <>
struct __libcpp_is_floating_point<long double> : public true_type {};
template <class _Tp>
struct is_floating_point
: public __libcpp_is_floating_point<typename remove_cv<_Tp>::type> {};
template <class _Tp>
struct is_arithmetic
: public integral_constant<
bool,
is_integral<_Tp>::value || is_floating_point<_Tp>::value> {};
template <class _Tp>
inline constexpr bool is_arithmetic_v = is_arithmetic<_Tp>::value;
template <class _Tp>
struct __numeric_type {
static void __test(...);
static float __test(float);
static double __test(char);
static double __test(int);
static double __test(unsigned);
static double __test(long);
static double __test(unsigned long);
static double __test(long long);
static double __test(unsigned long long);
static double __test(double);
static long double __test(long double);
typedef decltype(__test(declval<_Tp>())) type;
static const bool value = !is_same<type, void>::value;
};
template <>
struct __numeric_type<void> {
static const bool value = true;
};
// __promote
template <
class _A1,
class _A2 = void,
class _A3 = void,
bool = __numeric_type<_A1>::value && __numeric_type<_A2>::value &&
__numeric_type<_A3>::value>
class __promote_imp {
public:
static const bool value = false;
};
template <class _A1, class _A2, class _A3>
class __promote_imp<_A1, _A2, _A3, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
typedef typename __promote_imp<_A3>::type __type3;
public:
typedef decltype(__type1() + __type2() + __type3()) type;
static const bool value = true;
};
template <class _A1, class _A2>
class __promote_imp<_A1, _A2, void, true> {
private:
typedef typename __promote_imp<_A1>::type __type1;
typedef typename __promote_imp<_A2>::type __type2;
public:
typedef decltype(__type1() + __type2()) type;
static const bool value = true;
};
template <class _A1>
class __promote_imp<_A1, void, void, true> {
public:
typedef typename __numeric_type<_A1>::type type;
static const bool value = true;
};
template <class _A1, class _A2 = void, class _A3 = void>
class __promote : public __promote_imp<_A1, _A2, _A3> {};
} // namespace std
#endif
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifdef __NVCC__
#include <bit>
#else
namespace std {
template <class To, class From>
std::enable_if_t<sizeof(To) == sizeof(From), To> bit_cast(
const From& src) noexcept {
return *reinterpret_cast<const To*>(&src);
}
} // namespace std
// Intentionally not supporting signed integers to stay consistent with
// https://en.cppreference.com/w/cpp/numeric/bit_ceil
__device__ __forceinline__ unsigned int bit_ceil(unsigned int x) {
if (x == 0) {
return 1;
}
return 1u << (32 - __clz(x - 1));
}
__device__ __forceinline__ unsigned long long bit_ceil(unsigned long long x) {
if (x == 0) {
return 1;
}
return 1ull << (64 - __clzll(x - 1));
}
#endif
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#ifndef __NVCC__
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
//===----------------------------------------------------------------------===//
// The following namespace std is modified from LLVM, see the following
// copyright information
//
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// copy-pasted from the following llvm file:
// https://github.com/llvm/llvm-project/blob/main/libcxx/include/complex
namespace std {
template <class _Tp>
class complex;
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w);
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp>& __x, const complex<_Tp>& __y);
template <class _Tp>
class complex {
public:
typedef _Tp value_type;
private:
value_type __re_;
value_type __im_;
public:
constexpr complex(
const value_type& __re = value_type(),
const value_type& __im = value_type())
: __re_(__re), __im_(__im) {}
template <class _Xp>
constexpr complex(const complex<_Xp>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
constexpr value_type real() const {
return __re_;
}
constexpr value_type imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(const value_type& __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(const value_type& __re) {
__re_ += __re;
return *this;
}
complex& operator-=(const value_type& __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(const value_type& __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(const value_type& __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
template <>
class complex<double>;
template <>
class complex<float> {
float __re_;
float __im_;
public:
typedef float value_type;
constexpr complex(float __re = 0.0f, float __im = 0.0f)
: __re_(__re), __im_(__im) {}
explicit constexpr complex(const complex<double>& __c);
// copy volatile to non-volatile
constexpr complex(const volatile complex<float>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr complex(const complex<float>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr float real() const {
return __re_;
}
constexpr float imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(float __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(float __re) {
__re_ += __re;
return *this;
}
complex& operator-=(float __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(float __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(float __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// non-volatile to volatile
template <class _Xp>
volatile complex& operator=(const complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to non-volatile
template <class _Xp>
complex& operator=(const volatile complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to volatile
template <class _Xp>
volatile complex& operator=(const volatile complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
template <>
class complex<double> {
double __re_;
double __im_;
public:
typedef double value_type;
constexpr complex(double __re = 0.0, double __im = 0.0)
: __re_(__re), __im_(__im) {}
constexpr complex(const complex<float>& __c);
// copy volatile to non-volatile
constexpr complex(const volatile complex<double>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr complex(const complex<double>& other)
: __re_(other.__re_), __im_(other.__im_) {}
constexpr double real() const {
return __re_;
}
constexpr double imag() const {
return __im_;
}
void real(value_type __re) {
__re_ = __re;
}
void imag(value_type __im) {
__im_ = __im;
}
constexpr operator bool() const {
return real() || imag();
}
complex& operator=(double __re) {
__re_ = __re;
__im_ = value_type();
return *this;
}
complex& operator+=(double __re) {
__re_ += __re;
return *this;
}
complex& operator-=(double __re) {
__re_ -= __re;
return *this;
}
complex& operator*=(double __re) {
__re_ *= __re;
__im_ *= __re;
return *this;
}
complex& operator/=(double __re) {
__re_ /= __re;
__im_ /= __re;
return *this;
}
template <class _Xp>
complex& operator=(const complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// non-volatile to volatile
template <class _Xp>
volatile complex& operator=(const complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to non-volatile
template <class _Xp>
complex& operator=(const volatile complex<_Xp>& __c) {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
// volatile to volatile
template <class _Xp>
volatile complex& operator=(const volatile complex<_Xp>& __c) volatile {
__re_ = __c.real();
__im_ = __c.imag();
return *this;
}
template <class _Xp>
complex& operator+=(const complex<_Xp>& __c) {
__re_ += __c.real();
__im_ += __c.imag();
return *this;
}
template <class _Xp>
complex& operator-=(const complex<_Xp>& __c) {
__re_ -= __c.real();
__im_ -= __c.imag();
return *this;
}
template <class _Xp>
complex& operator*=(const complex<_Xp>& __c) {
*this = *this * complex(__c.real(), __c.imag());
return *this;
}
template <class _Xp>
complex& operator/=(const complex<_Xp>& __c) {
*this = *this / complex(__c.real(), __c.imag());
return *this;
}
};
inline constexpr complex<float>::complex(const complex<double>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
inline constexpr complex<double>::complex(const complex<float>& __c)
: __re_(__c.real()), __im_(__c.imag()) {}
// 26.3.6 operators:
template <class _Tp>
inline complex<_Tp> operator+(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator+(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __t(__x);
__t += __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator+(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__y);
__t += __x;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __t(__x);
__t -= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator-(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(-__y);
__t += __x;
return __t;
}
template <class _Tp>
complex<_Tp> operator*(const complex<_Tp>& __z, const complex<_Tp>& __w) {
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __ac = __a * __c;
_Tp __bd = __b * __d;
_Tp __ad = __a * __d;
_Tp __bc = __b * __c;
_Tp __x = __ac - __bd;
_Tp __y = __ad + __bc;
if (isnan(__x) && isnan(__y)) {
bool __recalc = false;
if (isinf(__a) || isinf(__b)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (isinf(__c) || isinf(__d)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
__recalc = true;
}
if (!__recalc &&
(isinf(__ac) || isinf(__bd) || isinf(__ad) || isinf(__bc))) {
if (isnan(__a))
__a = copysign(_Tp(0), __a);
if (isnan(__b))
__b = copysign(_Tp(0), __b);
if (isnan(__c))
__c = copysign(_Tp(0), __c);
if (isnan(__d))
__d = copysign(_Tp(0), __d);
__recalc = true;
}
if (__recalc) {
__x = _Tp(INFINITY) * (__a * __c - __b * __d);
__y = _Tp(INFINITY) * (__a * __d + __b * __c);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline complex<_Tp> operator*(const complex<_Tp>& __x, const _Tp& __y) {
complex<_Tp> __t(__x);
__t *= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator*(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__y);
__t *= __x;
return __t;
}
template <class _Tp>
complex<_Tp> operator/(const complex<_Tp>& __z, const complex<_Tp>& __w) {
int __ilogbw = 0;
_Tp __a = __z.real();
_Tp __b = __z.imag();
_Tp __c = __w.real();
_Tp __d = __w.imag();
_Tp __logbw = logb(fmax(fabs(__c), fabs(__d)));
if (isfinite(__logbw)) {
__ilogbw = static_cast<int>(__logbw);
__c = scalbn(__c, -__ilogbw);
__d = scalbn(__d, -__ilogbw);
}
_Tp __denom = __c * __c + __d * __d;
_Tp __x = scalbn((__a * __c + __b * __d) / __denom, -__ilogbw);
_Tp __y = scalbn((__b * __c - __a * __d) / __denom, -__ilogbw);
if (isnan(__x) && isnan(__y)) {
if ((__denom == _Tp(0)) && (!isnan(__a) || !isnan(__b))) {
__x = copysign(_Tp(INFINITY), __c) * __a;
__y = copysign(_Tp(INFINITY), __c) * __b;
} else if ((isinf(__a) || isinf(__b)) && isfinite(__c) && isfinite(__d)) {
__a = copysign(isinf(__a) ? _Tp(1) : _Tp(0), __a);
__b = copysign(isinf(__b) ? _Tp(1) : _Tp(0), __b);
__x = _Tp(INFINITY) * (__a * __c + __b * __d);
__y = _Tp(INFINITY) * (__b * __c - __a * __d);
} else if (
isinf(__logbw) && __logbw > _Tp(0) && isfinite(__a) && isfinite(__b)) {
__c = copysign(isinf(__c) ? _Tp(1) : _Tp(0), __c);
__d = copysign(isinf(__d) ? _Tp(1) : _Tp(0), __d);
__x = _Tp(0) * (__a * __c + __b * __d);
__y = _Tp(0) * (__b * __c - __a * __d);
}
}
return complex<_Tp>(__x, __y);
}
template <class _Tp>
inline complex<_Tp> operator/(const complex<_Tp>& __x, const _Tp& __y) {
return complex<_Tp>(__x.real() / __y, __x.imag() / __y);
}
template <class _Tp>
inline complex<_Tp> operator/(const _Tp& __x, const complex<_Tp>& __y) {
complex<_Tp> __t(__x);
__t /= __y;
return __t;
}
template <class _Tp>
inline complex<_Tp> operator+(const complex<_Tp>& __x) {
return __x;
}
template <class _Tp>
inline complex<_Tp> operator-(const complex<_Tp>& __x) {
return complex<_Tp>(-__x.real(), -__x.imag());
}
template <class _Tp>
inline constexpr bool operator==(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return __x.real() == __y.real() && __x.imag() == __y.imag();
}
template <class _Tp>
inline constexpr bool operator==(const complex<_Tp>& __x, const _Tp& __y) {
return __x.real() == __y && __x.imag() == 0;
}
template <class _Tp>
inline constexpr bool operator==(const _Tp& __x, const complex<_Tp>& __y) {
return __x == __y.real() && 0 == __y.imag();
}
template <class _Tp>
inline constexpr bool operator!=(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator!=(const complex<_Tp>& __x, const _Tp& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator!=(const _Tp& __x, const complex<_Tp>& __y) {
return !(__x == __y);
}
template <class _Tp>
inline constexpr bool operator&&(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return bool(__x) && bool(__y);
}
template <class _Tp>
inline constexpr bool isnan(const complex<_Tp>& __x) {
return isnan(__x.real()) || isnan(__x.imag());
}
template <class _Tp>
inline constexpr bool operator||(
const complex<_Tp>& __x,
const complex<_Tp>& __y) {
return bool(__x) || bool(__y);
}
// 26.3.7 values:
template <
class _Tp,
bool = is_integral<_Tp>::value,
bool = is_floating_point<_Tp>::value>
struct __libcpp_complex_overload_traits {};
// Integral Types
template <class _Tp>
struct __libcpp_complex_overload_traits<_Tp, true, false> {
typedef double _ValueType;
typedef complex<double> _ComplexType;
};
// Floating point types
template <class _Tp>
struct __libcpp_complex_overload_traits<_Tp, false, true> {
typedef _Tp _ValueType;
typedef complex<_Tp> _ComplexType;
};
// real
template <class _Tp>
inline constexpr _Tp real(const complex<_Tp>& __c) {
return __c.real();
}
template <class _Tp>
inline constexpr typename __libcpp_complex_overload_traits<_Tp>::_ValueType real(
_Tp __re) {
return __re;
}
// imag
template <class _Tp>
inline constexpr _Tp imag(const complex<_Tp>& __c) {
return __c.imag();
}
template <class _Tp>
inline constexpr typename __libcpp_complex_overload_traits<_Tp>::_ValueType imag(
_Tp) {
return 0;
}
// abs
template <class _Tp>
inline _Tp abs(const complex<_Tp>& __c) {
return hypot(__c.real(), __c.imag());
}
// arg
template <class _Tp>
inline _Tp arg(const complex<_Tp>& __c) {
return atan2(__c.imag(), __c.real());
}
template <class _Tp>
inline typename enable_if<
is_integral<_Tp>::value || is_same<_Tp, double>::value,
double>::type
arg(_Tp __re) {
return atan2(0., __re);
}
template <class _Tp>
inline typename enable_if<is_same<_Tp, float>::value, float>::type arg(
_Tp __re) {
return atan2f(0.F, __re);
}
} // namespace std
namespace std {
using ::isfinite;
using ::isinf;
using ::isnan;
using ::signbit;
using ::abs;
using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atan2;
using ::atan2f;
using ::atanf;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fmod;
using ::fmodf;
using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;
using ::log;
using ::logf;
using ::log10;
using ::log10f;
using ::modf;
using ::modff;
using ::pow;
using ::powf;
using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;
using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;
using ::tanh;
using ::tanhf;
using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;
using ::copysign;
using ::copysignf;
using ::erf;
using ::erfc;
using ::erfcf;
using ::erff;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fma;
using ::fmaf;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;
} // namespace std
namespace std {
// norm
template <class _Tp>
inline _Tp norm(const complex<_Tp>& __c) {
if (isinf(__c.real()))
return abs(__c.real());
if (isinf(__c.imag()))
return abs(__c.imag());
return __c.real() * __c.real() + __c.imag() * __c.imag();
}
template <class _Tp>
inline typename __libcpp_complex_overload_traits<_Tp>::_ValueType norm(
_Tp __re) {
typedef typename __libcpp_complex_overload_traits<_Tp>::_ValueType _ValueType;
return static_cast<_ValueType>(__re) * __re;
}
// conj
template <class _Tp>
inline complex<_Tp> conj(const complex<_Tp>& __c) {
return complex<_Tp>(__c.real(), -__c.imag());
}
template <class _Tp>
inline typename __libcpp_complex_overload_traits<_Tp>::_ComplexType conj(
_Tp __re) {
typedef
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType;
return _ComplexType(__re);
}
// proj
template <class _Tp>
inline complex<_Tp> proj(const complex<_Tp>& __c) {
complex<_Tp> __r = __c;
if (isinf(__c.real()) || isinf(__c.imag()))
__r = complex<_Tp>(INFINITY, copysign(_Tp(0), __c.imag()));
return __r;
}
template <class _Tp>
inline typename enable_if<
is_floating_point<_Tp>::value,
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType>::type
proj(_Tp __re) {
if (isinf(__re))
__re = abs(__re);
return complex<_Tp>(__re);
}
template <class _Tp>
inline typename enable_if<
is_integral<_Tp>::value,
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType>::type
proj(_Tp __re) {
typedef
typename __libcpp_complex_overload_traits<_Tp>::_ComplexType _ComplexType;
return _ComplexType(__re);
}
// polar
template <class _Tp>
complex<_Tp> polar(const _Tp& __rho, const _Tp& __theta = _Tp()) {
if (isnan(__rho) || signbit(__rho))
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
if (isnan(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, __theta);
return complex<_Tp>(__theta, __theta);
}
if (isinf(__theta)) {
if (isinf(__rho))
return complex<_Tp>(__rho, _Tp(NAN));
return complex<_Tp>(_Tp(NAN), _Tp(NAN));
}
_Tp __x = __rho * cos(__theta);
if (isnan(__x))
__x = 0;
_Tp __y = __rho * sin(__theta);
if (isnan(__y))
__y = 0;
return complex<_Tp>(__x, __y);
}
// log
template <class _Tp>
inline complex<_Tp> log(const complex<_Tp>& __x) {
return complex<_Tp>(log(abs(__x)), arg(__x));
}
// log10
template <class _Tp>
inline complex<_Tp> log10(const complex<_Tp>& __x) {
return log(__x) / log(_Tp(10));
}
// log2
template <class _Tp>
inline complex<_Tp> log2(const complex<_Tp>& __x) {
return log(__x) / log(_Tp(2));
}
// sqrt
template <class _Tp>
complex<_Tp> sqrt(const complex<_Tp>& __x) {
if (isinf(__x.imag()))
return complex<_Tp>(_Tp(INFINITY), __x.imag());
if (isinf(__x.real())) {
if (__x.real() > _Tp(0))
return complex<_Tp>(
__x.real(),
isnan(__x.imag()) ? __x.imag() : copysign(_Tp(0), __x.imag()));
return complex<_Tp>(
isnan(__x.imag()) ? __x.imag() : _Tp(0),
copysign(__x.real(), __x.imag()));
}
return polar(sqrt(abs(__x)), arg(__x) / _Tp(2));
}
// exp
template <class _Tp>
complex<_Tp> exp(const complex<_Tp>& __x) {
_Tp __i = __x.imag();
if (__i == 0) {
return complex<_Tp>(exp(__x.real()), copysign(_Tp(0), __x.imag()));
}
if (isinf(__x.real())) {
if (__x.real() < _Tp(0)) {
if (!isfinite(__i))
__i = _Tp(1);
} else if (__i == 0 || !isfinite(__i)) {
if (isinf(__i))
__i = _Tp(NAN);
return complex<_Tp>(__x.real(), __i);
}
}
_Tp __e = exp(__x.real());
return complex<_Tp>(__e * cos(__i), __e * sin(__i));
}
// pow
template <class _Tp>
inline complex<_Tp> pow(const complex<_Tp>& __x, const complex<_Tp>& __y) {
return exp(__y * log(__x));
}
template <class _Tp, class _Up>
inline complex<typename __promote<_Tp, _Up>::type> pow(
const complex<_Tp>& __x,
const complex<_Up>& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
template <class _Tp, class _Up>
inline typename enable_if<
is_arithmetic<_Up>::value,
complex<typename __promote<_Tp, _Up>::type>>::type
pow(const complex<_Tp>& __x, const _Up& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
template <class _Tp, class _Up>
inline typename enable_if<
is_arithmetic<_Tp>::value,
complex<typename __promote<_Tp, _Up>::type>>::type
pow(const _Tp& __x, const complex<_Up>& __y) {
typedef complex<typename __promote<_Tp, _Up>::type> result_type;
return std::pow(result_type(__x), result_type(__y));
}
// __sqr, computes pow(x, 2)
template <class _Tp>
inline complex<_Tp> __sqr(const complex<_Tp>& __x) {
return complex<_Tp>(
(__x.real() - __x.imag()) * (__x.real() + __x.imag()),
_Tp(2) * __x.real() * __x.imag());
}
// asinh
template <class _Tp>
complex<_Tp> asinh(const complex<_Tp>& __x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return __x;
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (__x.imag() == 0)
return __x;
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(
copysign(__x.imag(), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(__sqr(__x) + _Tp(1)));
return complex<_Tp>(
copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag()));
}
// acosh
template <class _Tp>
complex<_Tp> acosh(const complex<_Tp>& __x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(abs(__x.real()), __x.imag());
if (isinf(__x.imag())) {
if (__x.real() > 0)
return complex<_Tp>(__x.real(), copysign(__pi * _Tp(0.25), __x.imag()));
else
return complex<_Tp>(
-__x.real(), copysign(__pi * _Tp(0.75), __x.imag()));
}
if (__x.real() < 0)
return complex<_Tp>(-__x.real(), copysign(__pi, __x.imag()));
return complex<_Tp>(__x.real(), copysign(_Tp(0), __x.imag()));
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(abs(__x.imag()), __x.real());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(abs(__x.imag()), copysign(__pi / _Tp(2), __x.imag()));
complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
return complex<_Tp>(
copysign(__z.real(), _Tp(0)), copysign(__z.imag(), __x.imag()));
}
// atanh
template <class _Tp>
complex<_Tp> atanh(const complex<_Tp>& __x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.imag())) {
return complex<_Tp>(
copysign(_Tp(0), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
}
if (isnan(__x.imag())) {
if (isinf(__x.real()) || __x.real() == 0)
return complex<_Tp>(copysign(_Tp(0), __x.real()), __x.imag());
return complex<_Tp>(__x.imag(), __x.imag());
}
if (isnan(__x.real())) {
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.real())) {
return complex<_Tp>(
copysign(_Tp(0), __x.real()), copysign(__pi / _Tp(2), __x.imag()));
}
if (abs(__x.real()) == _Tp(1) && __x.imag() == _Tp(0)) {
return complex<_Tp>(
copysign(_Tp(INFINITY), __x.real()), copysign(_Tp(0), __x.imag()));
}
complex<_Tp> __z = log((_Tp(1) + __x) / (_Tp(1) - __x)) / _Tp(2);
return complex<_Tp>(
copysign(__z.real(), __x.real()), copysign(__z.imag(), __x.imag()));
}
// sinh
template <class _Tp>
complex<_Tp> sinh(const complex<_Tp>& __x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(__x.real(), _Tp(NAN));
if (__x.imag() == 0 && !isfinite(__x.real()))
return __x;
return complex<_Tp>(
sinh(__x.real()) * cos(__x.imag()), cosh(__x.real()) * sin(__x.imag()));
}
// cosh
template <class _Tp>
complex<_Tp> cosh(const complex<_Tp>& __x) {
if (isinf(__x.real()) && !isfinite(__x.imag()))
return complex<_Tp>(abs(__x.real()), _Tp(NAN));
if (__x.real() == 0 && !isfinite(__x.imag()))
return complex<_Tp>(_Tp(NAN), __x.real());
if (__x.real() == 0 && __x.imag() == 0)
return complex<_Tp>(_Tp(1), __x.imag());
if (__x.imag() == 0 && !isfinite(__x.real()))
return complex<_Tp>(abs(__x.real()), __x.imag());
return complex<_Tp>(
cosh(__x.real()) * cos(__x.imag()), sinh(__x.real()) * sin(__x.imag()));
}
// tanh
template <class _Tp>
complex<_Tp> tanh(const complex<_Tp>& __x) {
if (isinf(__x.real())) {
if (!isfinite(__x.imag()))
return complex<_Tp>(copysign(_Tp(1), __x.real()), _Tp(0));
return complex<_Tp>(
copysign(_Tp(1), __x.real()),
copysign(_Tp(0), sin(_Tp(2) * __x.imag())));
}
if (isnan(__x.real()) && __x.imag() == 0)
return __x;
_Tp __2r(_Tp(2) * __x.real());
_Tp __2i(_Tp(2) * __x.imag());
_Tp __d(cosh(__2r) + cos(__2i));
_Tp __2rsh(sinh(__2r));
if (isinf(__2rsh) && isinf(__d))
return complex<_Tp>(
__2rsh > _Tp(0) ? _Tp(1) : _Tp(-1), __2i > _Tp(0) ? _Tp(0) : _Tp(-0.));
return complex<_Tp>(__2rsh / __d, sin(__2i) / __d);
}
// asin
template <class _Tp>
complex<_Tp> asin(const complex<_Tp>& __x) {
complex<_Tp> __z = asinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// acos
template <class _Tp>
complex<_Tp> acos(const complex<_Tp>& __x) {
const _Tp __pi(atan2(+0., -0.));
if (isinf(__x.real())) {
if (isnan(__x.imag()))
return complex<_Tp>(__x.imag(), __x.real());
if (isinf(__x.imag())) {
if (__x.real() < _Tp(0))
return complex<_Tp>(_Tp(0.75) * __pi, -__x.imag());
return complex<_Tp>(_Tp(0.25) * __pi, -__x.imag());
}
if (__x.real() < _Tp(0))
return complex<_Tp>(__pi, signbit(__x.imag()) ? -__x.real() : __x.real());
return complex<_Tp>(_Tp(0), signbit(__x.imag()) ? __x.real() : -__x.real());
}
if (isnan(__x.real())) {
if (isinf(__x.imag()))
return complex<_Tp>(__x.real(), -__x.imag());
return complex<_Tp>(__x.real(), __x.real());
}
if (isinf(__x.imag()))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
if (__x.real() == 0 && (__x.imag() == 0 || isnan(__x.imag())))
return complex<_Tp>(__pi / _Tp(2), -__x.imag());
complex<_Tp> __z = log(__x + sqrt(__sqr(__x) - _Tp(1)));
if (signbit(__x.imag()))
return complex<_Tp>(abs(__z.imag()), abs(__z.real()));
return complex<_Tp>(abs(__z.imag()), -abs(__z.real()));
}
// atan
template <class _Tp>
complex<_Tp> atan(const complex<_Tp>& __x) {
complex<_Tp> __z = atanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// sin
template <class _Tp>
complex<_Tp> sin(const complex<_Tp>& __x) {
complex<_Tp> __z = sinh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// cos
template <class _Tp>
inline complex<_Tp> cos(const complex<_Tp>& __x) {
return cosh(complex<_Tp>(-__x.imag(), __x.real()));
}
// tan
template <class _Tp>
complex<_Tp> tan(const complex<_Tp>& __x) {
complex<_Tp> __z = tanh(complex<_Tp>(-__x.imag(), __x.real()));
return complex<_Tp>(__z.imag(), -__z.real());
}
// Literal suffix for complex number literals [complex.literals]
inline namespace literals {
inline namespace complex_literals {
constexpr complex<double> operator""i(long double __im) {
return {0.0, static_cast<double>(__im)};
}
constexpr complex<double> operator""i(unsigned long long __im) {
return {0.0, static_cast<double>(__im)};
}
constexpr complex<float> operator""if(long double __im) {
return {0.0f, static_cast<float>(__im)};
}
constexpr complex<float> operator""if(unsigned long long __im) {
return {0.0f, static_cast<float>(__im)};
}
} // namespace complex_literals
} // namespace literals
} // namespace std
__device__ std::complex<double> lerp(
std::complex<double> start,
std::complex<double> end,
std::complex<double> weight) {
if (abs(weight) < 0.5) {
return start + weight * (end - start);
} else {
return end - (end - start) * (1.0 - weight);
}
}
__device__ std::complex<float> lerp(
std::complex<float> start,
std::complex<float> end,
std::complex<float> weight) {
if (abs(weight) < 0.5f) {
return start + weight * (end - start);
} else {
return end - (end - start) * (1.0f - weight);
}
}
__device__ std::complex<double> reciprocal(std::complex<double> x) {
return 1.0 / x;
}
__device__ std::complex<float> reciprocal(std::complex<float> x) {
return 1.0f / x;
}
__device__ std::complex<double> sigmoid(std::complex<double> x) {
return 1.0 / (1.0 + exp(-x));
}
__device__ std::complex<float> sigmoid(std::complex<float> x) {
return 1.0f / (1.0f + exp(-x));
}
// The reciprocal of a complex number z is
// 1/z = conj(z)/|z|^2.
// The principal square root of a complex number z can be obtained by [1]
// sqrt(z) = sqrt(|z|) (z + |z|) / |z + |z||.
// Combining these formulas we have
// 1/sqrt(z) = (conj(z) + |z|) / (sqrt(|z|) |z + |z||).
// [1] https://math.stackexchange.com/a/44500
__device__ std::complex<float> rsqrt(std::complex<float> z) {
auto a = std::real(z);
auto b = std::imag(z);
auto absa = ::fabsf(a);
auto absb = ::fabsf(b);
// scale to avoid precision loss due to underflow/overflow
auto scale = fmax(absa, absb);
a /= scale;
b /= scale;
auto a_sq = a * a;
auto b_sq = b * b;
auto modz_sq = a_sq + b_sq;
auto modz = ::sqrtf(modz_sq);
auto a_plus_modz = a + modz;
auto mod_zplusmodz_sq = a_plus_modz * a_plus_modz + b_sq;
auto fac = ::rsqrtf(scale * modz * mod_zplusmodz_sq);
return std::complex<float>(a_plus_modz * fac, -b * fac);
}
__device__ std::complex<double> rsqrt(std::complex<double> z) {
auto a = std::real(z);
auto b = std::imag(z);
auto absa = ::abs(a);
auto absb = ::abs(b);
// scale to avoid precision loss due to underflow/overflow
auto scale = fmax(absa, absb);
a /= scale;
b /= scale;
auto a_sq = a * a;
auto b_sq = b * b;
auto modz_sq = a_sq + b_sq;
auto modz = ::sqrt(modz_sq);
auto a_plus_modz = a + modz;
auto mod_zplusmodz_sq = a_plus_modz * a_plus_modz + b_sq;
auto fac = ::rsqrt(scale * modz * mod_zplusmodz_sq);
return std::complex<double>(a_plus_modz * fac, -b * fac);
}
template <typename T>
bool isfinite(std::complex<T> x) {
return ::isfinite(std::real(x)) && ::isfinite(std::imag(x));
}
template <typename T>
bool isinf(std::complex<T> x) {
return ::isinf(std::real(x)) || ::isinf(std::imag(x));
}
template <typename T>
bool isreal(std::complex<T> x) {
return std::imag(x) == 0;
}
#endif // __NVCC__
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#define __NVFUSER_HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_HALF_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __half;
__device__ __inline__ __half __float2half(const float);
struct __align__(2) __half {
__half() = default;
__half(const __half& other) {
__x = other.__x;
}
__half(const __half&& other) {
__x = other.__x;
}
__half(const volatile __half& other) {
__x = other.__x;
}
__half(const volatile __half&& other) {
__x = other.__x;
}
// Note: not returning reference for `__half::operator=`
// Doing so would requires us to return `volatile __half&` for the volatile
// variants, which would trigger a gcc warning `implicit dereference will not
// access object of type ‘volatile S’ in statement`
__device__ void operator=(const __half& other) {
__x = other.__x;
}
__device__ void operator=(const __half&& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __half& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __half&& other) {
__x = other.__x;
}
__device__ void operator=(const __half& other) volatile {
__x = other.__x;
}
__device__ void operator=(const __half&& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __half& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __half&& other) volatile {
__x = other.__x;
}
__device__ __half(const float f) {
__x = __float2half(f).__x;
}
__device__ uint16_t raw() const {
return __x;
}
protected:
unsigned short __x;
};
__device__ __inline__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "f"(f));
return val;
}
__device__ __inline__ __half __double2half(const double d) {
__half val;
asm("{ cvt.rn.f16.f64 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "d"(d));
return val;
}
__device__ __inline__ __half __int2half(const int i) {
__half val;
asm("{ cvt.rn.f16.s32 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "r"(i));
return val;
}
__device__ __inline__ __half __int2half(const int64_t i64) {
__half val;
asm("{ cvt.rn.f16.s64 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "l"(i64));
return val;
}
__device__ __inline__ __half __int2half(const uint32_t i) {
__half val;
asm("{ cvt.rn.f16.u32 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "r"(i));
return val;
}
__device__ __inline__ __half __int2half(const uint64_t i64) {
__half val;
asm("{ cvt.rn.f16.u64 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "l"(i64));
return val;
}
__device__ __inline__ __half __bool2half(const bool b) {
return __int2half((int)b);
}
__device__ __inline__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ __inline__ double __half2double(const __half h) {
double val;
asm("{ cvt.f64.f16 %0, %1;}\n" : "=d"(val) : "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ int __half2int32(const __half h) {
int val;
asm("{ cvt.rzi.s32.f16 %0, %1;}\n"
: "=r"(val)
: "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ __inline__ int64_t __half2int(const __half h) {
int64_t val;
asm("{ cvt.rzi.s64.f16 %0, %1;}\n"
: "=l"(val)
: "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ int __half2uint32(const __half h) {
int val;
asm("{ cvt.rzi.u32.f16 %0, %1;}\n"
: "=r"(val)
: "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ __inline__ int64_t __half2uint(const __half h) {
int64_t val;
asm("{ cvt.rzi.u64.f16 %0, %1;}\n"
: "=l"(val)
: "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
}
__device__ __inline__ void __half2int(const __half h, int& output) {
output = __half2int32(h);
}
__device__ __inline__ void __half2int(const __half h, int64_t& output) {
output = __half2int(h);
}
__device__ __inline__ void __half2int(const __half h, uint32_t& output) {
output = __half2uint32(h);
}
__device__ __inline__ void __half2int(const __half h, uint64_t& output) {
output = __half2uint(h);
}
__device__ __inline__ nvfuser_index_t __half2index(const __half h) {
nvfuser_index_t result;
__half2int(h, result);
return result;
}
__device__ __inline__ bool __half2bool(const __half h) {
return (bool)__half2float(h) != 0;
}
__device__ __inline__ __half __real_then_2half(const std::complex<float> c) {
return __float2half(std::real(c));
}
__device__ __inline__ __half __real_then_2half(const std::complex<double> c) {
return __double2half(std::real(c));
}
__device__ __inline__ bool __heq(const __half a, const __half b) {
// From cuda_fp16.hpp
unsigned short val;
asm("{ .reg .pred __$temp3;\n"
" setp.eq.f16 __$temp3, %1, %2;\n"
" selp.u16 %0, 1, 0, __$temp3;}"
: "=h"(val)
: "h"(__NVFUSER_HALF_TO_CUS(a)), "h"(__NVFUSER_HALF_TO_CUS(b)));
return (val != 0U) ? true : false;
}
__device__ __inline__ __half operator|(const __half x, const __half y) {
__half val;
asm("{ or.b16 %0, %1, %2;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "h"(__NVFUSER_HALF_TO_CUS(x)), "h"(__NVFUSER_HALF_TO_CUS(y)));
return val;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#define __NVFUSER_BFLOAT_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
#define __NVFUSER_BFLOAT_TO_CUS(var) \
*(reinterpret_cast<const unsigned short*>(&(var)))
struct __bfloat;
__device__ __inline__ __bfloat __float2bfloat(const float);
struct __align__(2) __bfloat {
__bfloat() = default;
__bfloat(const __bfloat& other) {
__x = other.__x;
}
__bfloat(const __bfloat&& other) {
__x = other.__x;
}
__bfloat(const volatile __bfloat& other) {
__x = other.__x;
}
__bfloat(const volatile __bfloat&& other) {
__x = other.__x;
}
// Note: not returning reference for `__bfloat::operator=`
// Doing so would requires us to return `volatile __bfloat&` for the volatile
// variants, which would trigger a gcc warning `implicit dereference will not
// access object of type ‘volatile S’ in statement`
__device__ void operator=(const __bfloat& other) {
__x = other.__x;
}
__device__ void operator=(const __bfloat&& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __bfloat& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __bfloat&& other) {
__x = other.__x;
}
__device__ void operator=(const __bfloat& other) volatile {
__x = other.__x;
}
__device__ void operator=(const __bfloat&& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __bfloat& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __bfloat&& other) volatile {
__x = other.__x;
}
__device__ __bfloat(const float f) {
__x = __float2bfloat(f).__x;
}
__device__ uint16_t raw() const {
return __x;
}
protected:
unsigned short __x;
};
__device__ __inline__ __bfloat __float2bfloat(const float f) {
__bfloat val;
asm("{ cvt.rn.bf16.f32 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "f"(f));
return val;
}
__device__ __inline__ __bfloat __double2bfloat(const double d) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.f64 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "d"(d));
return val;
#else
return __float2bfloat(static_cast<float>(d));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const int i) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.s32 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "r"(i));
return val;
#else
return __float2bfloat(static_cast<float>(i));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const int64_t i64) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.s64 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "l"(i64));
return val;
#else
return __float2bfloat(static_cast<float>(i64));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const uint32_t i) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.u32 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "r"(i));
return val;
#else
return __float2bfloat(static_cast<float>(i));
#endif
}
__device__ __inline__ __bfloat __int2bfloat(const uint64_t i64) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.u64 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "l"(i64));
return val;
#else
return __float2bfloat(static_cast<float>(i64));
#endif
}
__device__ __inline__ __bfloat __bool2bfloat(const bool b) {
return __int2bfloat((int)b);
}
__device__ __inline__ float __bfloat2float(const __bfloat h) {
float val;
asm("{ mov.b32 %0, {0,%1};}\n"
: "=f"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
}
__device__ __inline__ double __bfloat2double(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
double val;
asm("{ cvt.f64.bf16 %0, %1;}\n"
: "=d"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return static_cast<double>(__bfloat2float(h));
#endif
}
__device__ int __bfloat2int32(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
int val;
asm("{ cvt.rzi.s32.bf16 %0, %1;}\n"
: "=r"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return static_cast<int>(__bfloat2float(h));
#endif
}
__device__ __inline__ int64_t __bfloat2int(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
int64_t val;
asm("{ cvt.rzi.s64.bf16 %0, %1;}\n"
: "=l"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return static_cast<int64_t>(__bfloat2float(h));
#endif
}
__device__ int __bfloat2uint32(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
int val;
asm("{ cvt.rzi.u32.bf16 %0, %1;}\n"
: "=r"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return static_cast<int>(__bfloat2float(h));
#endif
}
__device__ __inline__ int64_t __bfloat2uint(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
int64_t val;
asm("{ cvt.rzi.u64.bf16 %0, %1;}\n"
: "=l"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return static_cast<int64_t>(__bfloat2float(h));
#endif
}
__device__ __inline__ void __bfloat2int(const __bfloat h, int& output) {
output = __bfloat2int32(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, int64_t& output) {
output = __bfloat2int(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, uint32_t& output) {
output = __bfloat2uint32(h);
}
__device__ __inline__ void __bfloat2int(const __bfloat h, uint64_t& output) {
output = __bfloat2uint(h);
}
__device__ __inline__ nvfuser_index_t __bfloat2index(
const __bfloat h,
bool& output) {
nvfuser_index_t result;
__bfloat2int(h, result);
return result;
}
__device__ __inline__ bool __bfloat2bool(const __bfloat h) {
return (bool)__bfloat2float(h) != 0;
}
__device__ __inline__ __bfloat __half2bfloat(const __half h) {
#if __CUDA_ARCH__ >= 900
__bfloat val;
asm("{ cvt.rn.bf16.f16 %0, %1;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "h"(__NVFUSER_HALF_TO_CUS(h)));
return val;
#else
return __float2bfloat(__half2float(h));
#endif
}
__device__ __inline__ __half __bfloat2half(const __bfloat h) {
#if __CUDA_ARCH__ >= 900
__half val;
asm("{ cvt.rn.f16.bf16 %0, %1;}\n"
: "=h"(__NVFUSER_HALF_TO_US(val))
: "h"(__NVFUSER_BFLOAT_TO_CUS(h)));
return val;
#else
return __float2half(__bfloat2float(h));
#endif
}
__device__ __inline__ __bfloat __real_then_2bfloat(
const std::complex<float> c) {
return __float2bfloat(std::real(c));
}
__device__ __inline__ __bfloat __real_then_2bfloat(
const std::complex<double> c) {
return __double2bfloat(std::real(c));
}
__device__ __inline__ bool __heq(const __bfloat a, const __bfloat b) {
// From cuda_bf16.hpp
#if __CUDA_ARCH__ >= 900
unsigned short val;
asm("{ .reg .pred __$temp3;\n"
" setp.eq.bf16 __$temp3, %1, %2;\n"
" selp.u16 %0, 1, 0, __$temp3;}"
: "=h"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(a)), "h"(__NVFUSER_BFLOAT_TO_CUS(b)));
#else
unsigned int val;
asm("{.reg .b32 a,b;\n"
" mov.b32 a, {0, %1};\n"
" mov.b32 b, {0, %2};\n"
" set.eq.f32.f32 %0, a, b;}\n"
: "=r"(val)
: "h"(__NVFUSER_BFLOAT_TO_CUS(a)), "h"(__NVFUSER_BFLOAT_TO_CUS(b)));
#endif
return (val != 0U) ? true : false;
}
__device__ __inline__ __bfloat operator|(const __bfloat x, const __bfloat y) {
__bfloat val;
asm("{ or.b16 %0, %1, %2;}\n"
: "=h"(__NVFUSER_BFLOAT_TO_US(val))
: "h"(__NVFUSER_BFLOAT_TO_CUS(x)), "h"(__NVFUSER_BFLOAT_TO_CUS(y)));
return val;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
struct __e4m3;
__device__ __inline__ __e4m3 __float2e4m3(const float);
__device__ __inline__ __e4m3 __double2e4m3(const double);
struct __align__(1) __e4m3 {
__e4m3() = default;
__e4m3(const __e4m3& other) {
__x = other.__x;
}
__e4m3(const __e4m3&& other) {
__x = other.__x;
}
__e4m3(const volatile __e4m3& other) {
__x = other.__x;
}
__e4m3(const volatile __e4m3&& other) {
__x = other.__x;
}
// Note: not returning reference for `__e4m3::operator=`
// Doing so would requires us to return `volatile __e4m3&` for the volatile
// variants, which would trigger a gcc warning `implicit dereference will not
// access object of type ‘volatile S’ in statement`
__device__ void operator=(const __e4m3& other) {
__x = other.__x;
}
__device__ void operator=(const __e4m3&& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __e4m3& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __e4m3&& other) {
__x = other.__x;
}
__device__ void operator=(const __e4m3& other) volatile {
__x = other.__x;
}
__device__ void operator=(const __e4m3&& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __e4m3& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __e4m3&& other) volatile {
__x = other.__x;
}
__device__ __e4m3(const float f) {
__x = __float2e4m3(f).__x;
}
__device__ __e4m3(const double f) {
__x = __double2e4m3(f).__x;
}
__device__ __e4m3(const int x) : __x(x) {}
__device__ __e4m3(const long long x) : __x(x) {}
__device__ __e4m3(const uint8_t x) : __x(x) {}
__device__ __e4m3(const uint16_t x) : __x(x) {}
__device__ uint8_t raw() const {
return __x;
}
protected:
uint8_t __x;
};
// NOTE [ fp8 cast optimization ]
//
// For simplicity, we only provided fp8 <-> fp32 cast implementation, while
// relying on any other fp cast in the form of target_fp <-> fp32 <-> fp8.
// This avoids the complication of handling hardware specific instructions on
// various compute capabilities.
// But this simplicity could come at the cost of performance. In cuda_fp8.hpp,
// 1. bf16 -> fp8 is done via bf16 -> float -> fp8
// 2. fp16 -> fp8 is done with a conditional
// # if (> sm_89)
// fp16 -> fp8
// # else
// fp16 -> fp32 -> fp8
// # endif
// 3. fp64 -> fp8 is handled explicitly as bitwise operations.
// TODO consider cuda_fp8.hpp for performance optimized cast.
__device__ __inline__ __e4m3 __float2e4m3(const float f) {
constexpr float f_const_zero = 0.f;
unsigned short _tmp_buffer;
__e4m3 val;
asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %1, %2;}"
: "=h"(_tmp_buffer)
: "f"(f_const_zero), "f"(f));
memcpy(&val, &_tmp_buffer, sizeof(uint8_t));
return val;
}
__device__ __inline__ float __e4m32float(const __e4m3 b) {
unsigned short _tmp_buffer;
memcpy(&_tmp_buffer, &b, sizeof(uint8_t));
float val;
asm("{\n\t"
".reg .b32 buf0;\n\t"
"cvt.rn.f16x2.e4m3x2 buf0, %1;\n\t"
"cvt.u16.u32 %1, buf0;\n\t"
"cvt.f32.f16 %0, %1;\n\t"
"}"
: "=f"(val)
: "h"(_tmp_buffer));
return val;
}
__device__ __inline__ __e4m3 __double2e4m3(const double d) {
return __float2e4m3(d);
}
__device__ __inline__ double __e4m32double(const __e4m3 b) {
return __e4m32float(b);
}
__device__ __inline__ __e4m3 __half2e4m3(const __half h) {
return __float2e4m3(__half2float(h));
}
__device__ __inline__ __half __e4m32half(const __e4m3 b) {
return __float2half(__e4m32float(b));
}
__device__ __inline__ __e4m3 __bfloat2e4m3(const __bfloat h) {
return __float2e4m3(__bfloat2float(h));
}
__device__ __inline__ __bfloat __e4m32bfloat(const __e4m3 b) {
return __float2bfloat(__e4m32float(b));
}
__device__ __inline__ __e4m3 operator|(const __e4m3 x, const __e4m3 y) {
unsigned short val;
unsigned short x_val = x.raw();
unsigned short y_val = y.raw();
asm("{ or.b16 %0, %1, %2;}\n" : "=h"(val) : "h"(x_val), "h"(y_val));
return __e4m3(val);
}
struct __e5m2;
__device__ __inline__ __e5m2 __float2e5m2(const float);
__device__ __inline__ __e5m2 __double2e5m2(const double);
struct __align__(1) __e5m2 {
__e5m2() = default;
__e5m2(const __e5m2& other) {
__x = other.__x;
}
__e5m2(const __e5m2&& other) {
__x = other.__x;
}
__e5m2(const volatile __e5m2& other) {
__x = other.__x;
}
__e5m2(const volatile __e5m2&& other) {
__x = other.__x;
}
// Note: not returning reference for `__e5m2::operator=`
// Doing so would requires us to return `volatile __e5m2&` for the volatile
// variants, which would trigger a gcc warning `implicit dereference will not
// access object of type ‘volatile S’ in statement`
__device__ void operator=(const __e5m2& other) {
__x = other.__x;
}
__device__ void operator=(const __e5m2&& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __e5m2& other) {
__x = other.__x;
}
__device__ void operator=(const volatile __e5m2&& other) {
__x = other.__x;
}
__device__ void operator=(const __e5m2& other) volatile {
__x = other.__x;
}
__device__ void operator=(const __e5m2&& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __e5m2& other) volatile {
__x = other.__x;
}
__device__ void operator=(const volatile __e5m2&& other) volatile {
__x = other.__x;
}
__device__ __e5m2(const float f) {
__x = __float2e5m2(f).__x;
}
__device__ __e5m2(const double f) {
__x = __double2e5m2(f).__x;
}
__device__ __e5m2(const int x) : __x(x) {}
__device__ __e5m2(const long long x) : __x(x) {}
__device__ __e5m2(const uint8_t x) : __x(x) {}
__device__ __e5m2(const uint16_t x) : __x(x) {}
__device__ uint8_t raw() const {
return __x;
}
protected:
uint8_t __x;
};
// see NOTE [ fp8 cast optimization ]
__device__ __inline__ __e5m2 __float2e5m2(const float f) {
constexpr float f_const_zero = 0.f;
unsigned short _tmp_buffer;
__e5m2 val;
asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %1, %2;}"
: "=h"(_tmp_buffer)
: "f"(f_const_zero), "f"(f));
memcpy(&val, &_tmp_buffer, sizeof(uint8_t));
return val;
}
__device__ __inline__ float __e5m22float(const __e5m2 b) {
unsigned short _tmp_buffer;
memcpy(&_tmp_buffer, &b, sizeof(uint8_t));
float val;
asm("{\n\t"
".reg .b32 buf0;\n\t"
"cvt.rn.f16x2.e5m2x2 buf0, %1;\n\t"
"cvt.u16.u32 %1, buf0;\n\t"
"cvt.f32.f16 %0, %1;\n\t"
"}"
: "=f"(val)
: "h"(_tmp_buffer));
return val;
}
__device__ __inline__ __e5m2 __double2e5m2(const double f) {
return __float2e5m2(f);
}
__device__ __inline__ double __e5m22double(const __e5m2 b) {
return __e5m22float(b);
}
__device__ __inline__ __e5m2 __half2e5m2(const __half h) {
return __float2e5m2(__half2float(h));
}
__device__ __inline__ __half __e5m22half(const __e5m2 b) {
return __float2half(__e5m22float(b));
}
__device__ __inline__ __e5m2 __bfloat2e5m2(const __bfloat h) {
return __float2e5m2(__bfloat2float(h));
}
__device__ __inline__ __bfloat __e5m22bfloat(const __e5m2 b) {
return __float2bfloat(__e5m22float(b));
}
__device__ __inline__ __e5m2 operator|(const __e5m2 x, const __e5m2 y) {
unsigned short val;
unsigned short x_val = x.raw();
unsigned short y_val = y.raw();
asm("{ or.b16 %0, %1, %2;}\n" : "=h"(val) : "h"(x_val), "h"(y_val));
return __e5m2(val);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Type trait utils
template <typename Type, bool is_volatile>
struct MaybeVolatile;
template <typename Type>
struct MaybeVolatile<Type, true> {
using type = volatile Type;
};
template <typename Type>
struct MaybeVolatile<Type, false> {
using type = Type;
};
template <typename... Types>
struct TypeList {};
template <int idx, typename T, typename... Types>
struct TypeSelector {
using type = typename TypeSelector<idx - 1, Types...>::type;
};
template <typename T, typename... Types>
struct TypeSelector<0, T, Types...> {
using type = T;
};
template <typename T0, typename T1>
struct IsSameType {
static constexpr bool value = false;
};
template <typename T0>
struct IsSameType<T0, T0> {
static constexpr bool value = true;
};
template <typename T>
struct IsPointerType {
static constexpr bool value = false;
};
template <typename T>
struct IsPointerType<T*> {
static constexpr bool value = true;
};
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// aligned register array for vectorized load/store
template <typename scalar_t, int size, int align_size = 1>
struct alignas(sizeof(scalar_t) * align_size) Array {
scalar_t array[size];
__device__ void set(scalar_t v) {
#pragma unroll
for (int i = 0; i < size; ++i) {
array[i] = v;
}
}
__device__ scalar_t& operator[](const unsigned int i) {
return array[i];
}
__device__ const scalar_t& operator[](const unsigned int i) const {
return array[i];
}
Array& operator=(const Array& a) {
#pragma unroll
for (int i = 0; i < size; ++i) {
array[i] = a[i];
}
return *this;
}
};
// Used for vectorized allocations that are not in registers
template <typename scalar_t, int vec_size>
__device__ void arraySet(scalar_t* buff, scalar_t val) {
#pragma unroll
for (int i = 0; i < vec_size; ++i) {
buff[i] = val;
}
}
template <typename scalar_t, int vec_size>
__device__ void loadGeneric(scalar_t* to, scalar_t* from) {
// It would be really nice to use memcpy here, but one example was failing
// with:
//
// memcpy(to, from, vec_size * sizeof(scalar_t));
//
// Yet passing with:
//
// for(int i = 0; i < vec_size; i++){
// to[i] = from[i];
// }
switch (sizeof(scalar_t) * vec_size) {
case 1:
*reinterpret_cast<uchar1*>(to) = *reinterpret_cast<uchar1*>(from);
break;
case 2:
*reinterpret_cast<uchar2*>(to) = *reinterpret_cast<uchar2*>(from);
break;
case 4:
*reinterpret_cast<uint1*>(to) = *reinterpret_cast<uint1*>(from);
break;
case 8:
*reinterpret_cast<uint2*>(to) = *reinterpret_cast<uint2*>(from);
break;
case 12:
*reinterpret_cast<uint3*>(to) = *reinterpret_cast<uint3*>(from);
break;
case 16:
*reinterpret_cast<uint4*>(to) = *reinterpret_cast<uint4*>(from);
break;
}
}
// Volatile version only works with c++ fundamnetal types
template <
typename scalar_t,
int vec_size,
bool is_volatile_to,
bool is_volatile_from>
__device__ void loadGenericVolatile(
typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
switch (sizeof(scalar_t) * vec_size) {
// Reinterpret cast like this with volatile types only works for C++
// fundamental types otherwise the = operator is not defined
case 1:
*reinterpret_cast<
typename MaybeVolatile<unsigned char, is_volatile_to>::type*>(to) =
*reinterpret_cast<
typename MaybeVolatile<unsigned char, is_volatile_from>::type*>(
from);
break;
case 2:
*reinterpret_cast<typename MaybeVolatile<short, is_volatile_to>::type*>(
to) =
*reinterpret_cast<
typename MaybeVolatile<short, is_volatile_from>::type*>(from);
break;
case 4:
*reinterpret_cast<
typename MaybeVolatile<unsigned int, is_volatile_to>::type*>(to) =
*reinterpret_cast<
typename MaybeVolatile<unsigned int, is_volatile_from>::type*>(
from);
break;
case 8:
*reinterpret_cast<typename MaybeVolatile<double, is_volatile_to>::type*>(
to) =
*reinterpret_cast<
typename MaybeVolatile<double, is_volatile_from>::type*>(from);
break;
}
}
template <typename scalar_t, int vec_size, bool is_volatile>
__device__ void loadLocalToGlobal(
typename MaybeVolatile<scalar_t, is_volatile>::type* to,
scalar_t* from) {
switch (sizeof(scalar_t) * vec_size) {
case 1:
case 2:
case 4:
loadGenericVolatile<scalar_t, vec_size, is_volatile, false>(to, from);
break;
case 8: {
uint2 const& data = *reinterpret_cast<uint2*>(from);
if (is_volatile) {
asm volatile(
"st.volatile.global.v2.s32 [%0], {%1,%2};" ::"l"(
(typename MaybeVolatile<uint2, is_volatile>::type*)to),
"r"(data.x),
"r"(data.y));
} else {
asm volatile(
"st.global.cs.v2.s32 [%0], {%1,%2};" ::"l"(
(typename MaybeVolatile<uint2, is_volatile>::type*)to),
"r"(data.x),
"r"(data.y));
}
break;
}
case 16: {
uint4 const& data = *reinterpret_cast<uint4*>(from);
if (is_volatile) {
asm volatile(
"st.volatile.global.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
(typename MaybeVolatile<uint4, is_volatile>::type*)to),
"r"(data.x),
"r"(data.y),
"r"(data.z),
"r"(data.w));
} else {
asm volatile(
"st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" ::"l"(
(typename MaybeVolatile<uint4, is_volatile>::type*)to),
"r"(data.x),
"r"(data.y),
"r"(data.z),
"r"(data.w));
}
break;
}
}
}
// This is copied from csrc/type.h and should be kept consistent.
enum class CacheOp {
AllLevels,
Streaming,
Global,
};
template <typename T, CacheOp cache_op>
__device__ void loadGlobalToLocalCached(void* to, void* from) {
T* typed_to = reinterpret_cast<T*>(to);
T* typed_from = reinterpret_cast<T*>(from);
switch (cache_op) {
case CacheOp::AllLevels:
*typed_to = __ldca(typed_from);
break;
case CacheOp::Streaming:
*typed_to = __ldcs(typed_from);
break;
case CacheOp::Global:
*typed_to = __ldcg(typed_from);
break;
}
}
// For simplicity, cache_op is only used for non-volatile loads written in
// inline assembly. Other loads are done with the default cache operator --
// cache all levels. ld.volatile doesn't accept cache operator anyway.
template <typename scalar_t, int vec_size, bool is_volatile, CacheOp cache_op>
__device__ void loadGlobalToLocal(
scalar_t* to,
typename MaybeVolatile<scalar_t, is_volatile>::type* from) {
switch (sizeof(scalar_t) * vec_size) {
case 1:
case 2:
case 4:
loadGenericVolatile<scalar_t, vec_size, false, is_volatile>(to, from);
break;
case 8: {
if (is_volatile) {
uint2& data = *reinterpret_cast<uint2*>(to);
asm volatile("ld.volatile.global.v2.s32 {%0,%1}, [%2];"
: "=r"(data.x), "=r"(data.y)
: "l"((uint2*)from));
} else {
loadGlobalToLocalCached<uint2, cache_op>(
to, const_cast<scalar_t*>(from));
}
break;
}
case 16: {
if (is_volatile) {
uint4& data = *reinterpret_cast<uint4*>(to);
asm volatile("ld.volatile.global.v4.s32 {%0,%1,%2,%3}, [%4];"
: "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w)
: "l"((uint4*)from));
} else {
loadGlobalToLocalCached<uint4, cache_op>(
to, const_cast<scalar_t*>(from));
}
break;
}
}
}
template <
typename scalar_t,
int vec_size,
bool is_volatile_to,
bool is_volatile_from>
__device__ void loadGlobalToGlobal(
typename MaybeVolatile<scalar_t, is_volatile_to>::type* to,
typename MaybeVolatile<scalar_t, is_volatile_from>::type* from) {
switch (sizeof(scalar_t) * vec_size) {
// Reinterpret cast like this with volatile types only works for C++
// fundamental types otherwise the = operator is not defined
case 1:
case 2:
case 4:
case 8:
loadGenericVolatile<scalar_t, vec_size, is_volatile_to, is_volatile_from>(
to, from);
break;
case 12: {
uint3 local_intermediate;
loadGlobalToLocal<
scalar_t,
vec_size,
is_volatile_from,
CacheOp::Streaming>(
reinterpret_cast<scalar_t*>(&local_intermediate), from);
loadLocalToGlobal<scalar_t, vec_size, is_volatile_to>(
to, reinterpret_cast<scalar_t*>(&local_intermediate));
break;
}
case 16: {
uint4 local_intermediate;
loadGlobalToLocal<
scalar_t,
vec_size,
is_volatile_from,
CacheOp::Streaming>(
reinterpret_cast<scalar_t*>(&local_intermediate), from);
loadLocalToGlobal<scalar_t, vec_size, is_volatile_to>(
to, reinterpret_cast<scalar_t*>(&local_intermediate));
break;
}
}
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// TMemTensor is a wrapper around a uint32_t that provides a convenient way to
// manipulate tensor memory addresses. Example usage:
// TMemTensor T0(0x12345678):
// -> address (lane=0x1234, col=0x5678):
// TMemTensor T1 = T0 + {64, 64}:
// -> address (lane=T0.lane+64, col=T0.col+64)
// TMemTensor T2(0x12345678, 32, 32):
// -> address (lane=0x1234+32, col=0x5678+32)
struct TMemTensor {
uint32_t raw_address;
public:
uint32_t static add(uint32_t base, Array<uint16_t, 2> offset) {
// Mentally, it makes more sense to think of TMem address as (lane, column)
// but because GPUs are little-endian, the address is stored in reverse
// order as (column, lane). So we swap the order of the offset before adding
// it to the base address.
uint16_t tmp = offset[0];
offset[0] = offset[1];
offset[1] = tmp;
return base + *reinterpret_cast<const uint32_t*>(&offset);
}
TMemTensor(uint32_t raw_address) : raw_address(raw_address) {}
TMemTensor(uint32_t base_address, uint16_t lane_offset, uint16_t col_offset)
: raw_address(add(base_address, {lane_offset, col_offset})) {}
operator uint32_t() const {
return raw_address;
}
uint32_t operator+(Array<uint16_t, 2> offset) const {
return add(raw_address, offset);
}
};
static_assert(
sizeof(TMemTensor) == sizeof(uint32_t),
"TMemTensor must be a uint32_t");
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
template <typename T, int Dims, int AllocDims = Dims>
struct Tensor {
__device__ T& operator[](nvfuser_index_t ind) {
return data[ind];
};
T* data;
Array<nvfuser_index_t, Dims, 1> logical_size;
Array<nvfuser_index_t, AllocDims, 1> alloc_stride;
};
// Specialization for 0-dim case as it does not need size and stride arrays.
// They will be an error as well since zero-length arrays are not allowed.
template <typename T>
struct Tensor<T, 0> {
__device__ T& operator[](nvfuser_index_t i) {
return *data;
};
T* data;
};
// Specialization for 0-dim case that's easy to pass in a CPU based tensor.
template <typename T>
struct CpuScalarTensor {
__device__ T& operator[](int i) {
return data;
};
T data;
};
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
__device__ unsigned int mulhilo32(
unsigned int a,
unsigned int b,
unsigned int* result_high) {
*result_high = __umulhi(a, b);
return a * b;
}
__device__ Array<uint32_t, 4> single_round(
Array<uint32_t, 4> ctr,
Array<uint32_t, 2> key) {
constexpr unsigned long kPhiloxSA = 0xD2511F53;
constexpr unsigned long kPhiloxSB = 0xCD9E8D57;
unsigned int hi0;
unsigned int hi1;
unsigned int lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0);
unsigned int lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1);
Array<uint32_t, 4> ret = {
hi1 ^ ctr[1] ^ key[0], lo1, hi0 ^ ctr[3] ^ key[1], lo0};
return ret;
}
__device__ Array<uint32_t, 4> philox(
unsigned long long seed,
unsigned long long subsequence,
unsigned long long offset) {
constexpr unsigned long kPhilox10A = 0x9E3779B9;
constexpr unsigned long kPhilox10B = 0xBB67AE85;
Array<uint32_t, 2> key;
key[0] = (unsigned int)seed;
key[1] = (unsigned int)(seed >> 32);
Array<uint32_t, 4> counter;
counter[0] = (unsigned int)(offset);
counter[1] = (unsigned int)(offset >> 32);
counter[2] = (unsigned int)(subsequence);
counter[3] = (unsigned int)(subsequence >> 32);
Array<uint32_t, 4> output = {};
Array<uint32_t, 2> key_ = key;
Array<uint32_t, 4> counter_ = counter;
for (int i = 0; i < 9; i++) {
counter_ = single_round(counter_, key_);
key_[0] += (kPhilox10A);
key_[1] += (kPhilox10B);
}
output = single_round(counter_, key_);
return output;
}
// This is a uniform double in the range (0, 1]
__device__ double raw_uniform_double(unsigned int x, unsigned int y) {
constexpr double scale = 1.0 / (double)(1ll << 53);
const unsigned long long z =
(unsigned long long)x ^ ((unsigned long long)y << (53 - 32));
return (double)z * scale + 0.5 * scale;
}
// This is a uniform float in the range (0, 1]
__device__ float raw_uniform_float(unsigned int x) {
constexpr float scale = (float)(1.0 / (double)(1ll << 32));
return (float)x * scale + 0.5f * scale;
}
__device__ __half uniform_half(unsigned int x) {
__half result = __float2half(raw_uniform_float(x));
return __heq(result, __float2half(1.0f)) ? __float2half(0.0f) : result;
}
__device__ __bfloat uniform_bfloat(unsigned int x) {
__bfloat result = __float2bfloat(raw_uniform_float(x));
return __heq(result, __float2bfloat(1.0f)) ? __float2bfloat(0.0f) : result;
}
__device__ float uniformf(unsigned int x) {
float result = raw_uniform_float(x);
return result == 1.0f ? 0.0f : result;
}
__device__ double uniform(unsigned int x, unsigned int y) {
double result = raw_uniform_double(x, y);
return result == 1.0 ? 0.0 : result;
}
__device__ double rng_uniform(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return uniform(
rng_result[rng_component * 2], rng_result[rng_component * 2 + 1]);
}
__device__ float rng_uniformf(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return uniformf(rng_result[rng_component]);
}
__device__ __half
rng_uniform_half(const Array<uint32_t, 4>& rng_result, int rng_component) {
return uniform_half(rng_result[rng_component]);
}
__device__ __bfloat
rng_uniform_bfloat(const Array<uint32_t, 4>& rng_result, int rng_component) {
return uniform_bfloat(rng_result[rng_component]);
}
__device__ double rng_uniform_range(
const Array<uint32_t, 4>& rng_result,
int rng_component,
double from,
double to) {
auto range = to - from;
auto uniform01 = rng_uniform(rng_result, rng_component);
return from + range * uniform01;
}
__device__ float rng_uniform_rangef(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float from,
float to) {
auto range = to - from;
auto uniform01 = rng_uniformf(rng_result, rng_component);
return from + range * uniform01;
}
__device__ __half rng_uniform_range_half(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float from,
float to) {
auto range = to - from;
float uniform01 = raw_uniform_float(rng_result[rng_component]);
__half result = __float2half(from + range * uniform01);
return __heq(result, __float2half(to)) ? __float2half(from) : result;
}
__device__ __bfloat rng_uniform_range_bfloat(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float from,
float to) {
auto range = to - from;
float uniform01 = raw_uniform_float(rng_result[rng_component]);
__bfloat result = __float2bfloat(from + range * uniform01);
return __heq(result, __float2bfloat(to)) ? __float2bfloat(from) : result;
}
__device__ float normalf(unsigned int x, unsigned int y, int rng_component) {
float u = uniformf(x);
float v = uniformf(y) * 6.2831855f;
if (rng_component % 2 == 0) {
return sqrtf(-2.0f * logf(u)) * sinf(v);
} else {
return sqrtf(-2.0f * logf(u)) * cosf(v);
}
}
__device__ double normal(
unsigned int x0,
unsigned int x1,
unsigned int y0,
unsigned int y1,
int rng_component) {
double u = uniform(x0, x1);
double v = uniform(y0, y1) * 6.2831853071795860;
if (rng_component % 2 == 0) {
return sqrt(-2.0 * log(u)) * sin(v);
} else {
return sqrt(-2.0 * log(u)) * cos(v);
}
}
__device__ double rng_normal_standard(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return normal(
rng_result[0],
rng_result[1],
rng_result[2],
rng_result[3],
rng_component);
}
__device__ float rng_normal_standardf(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return normalf(
rng_result[rng_component / 2 * 2],
rng_result[1 + rng_component / 2 * 2],
rng_component);
}
__device__ __half rng_normal_standard_half(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return __float2half(normalf(
rng_result[rng_component / 2 * 2],
rng_result[1 + rng_component / 2 * 2],
rng_component));
}
__device__ __bfloat rng_normal_standard_bfloat(
const Array<uint32_t, 4>& rng_result,
int rng_component) {
return __float2bfloat(normalf(
rng_result[rng_component / 2 * 2],
rng_result[1 + rng_component / 2 * 2],
rng_component));
}
__device__ double rng_normal_general(
const Array<uint32_t, 4>& rng_result,
int rng_component,
double mean,
double std) {
auto normal01 = rng_normal_standard(rng_result, rng_component);
return normal01 * std + mean;
}
__device__ float rng_normal_generalf(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float mean,
float std) {
auto normal01 = rng_normal_standardf(rng_result, rng_component);
return normal01 * std + mean;
}
__device__ __half rng_normal_general_half(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float mean,
float std) {
auto normal01 = normalf(
rng_result[rng_component / 2 * 2],
rng_result[1 + rng_component / 2 * 2],
rng_component);
return __float2half(normal01 * std + mean);
}
__device__ __bfloat rng_normal_general_bfloat(
const Array<uint32_t, 4>& rng_result,
int rng_component,
float mean,
float std) {
auto normal01 = normalf(
rng_result[rng_component / 2 * 2],
rng_result[1 + rng_component / 2 * 2],
rng_component);
return __float2bfloat(normal01 * std + mean);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
#define NVFUSER_DEFINE_MAGIC_ZERO \
__shared__ int nvfuser_zero_s; \
if (threadIdx.x == 0) \
nvfuser_zero_s = 0; \
__syncthreads(); \
atomicMin(&nvfuser_zero_s, threadIdx.x); \
int nvfuser_zero = nvfuser_zero_s;
#define NVFUSER_UPDATE_MAGIC_ZERO \
do { \
nvfuser_zero <<= 1; \
} while (0);
#ifdef __NVCC__
#include <assert.h>
#endif // __NVCC__
__device__ constexpr int ceilDiv(int a, int b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int64_t b) {
return (a + b - 1) / b;
}
__device__ constexpr int64_t ceilDiv(int64_t a, int b) {
return ceilDiv(a, (int64_t)b);
}
__device__ constexpr int64_t ceilDiv(int a, int64_t b) {
return ceilDiv((int64_t)a, b);
}
__device__ constexpr double ceilDiv(double a, double b) {
return std::ceil(a / b);
}
__device__ constexpr double ceilDiv(double a, int64_t b) {
return std::ceil(a / b);
}
__device__ constexpr double ceilDiv(int64_t a, double b) {
return std::ceil(a / b);
}
// Monotonic and precise lerp is described here:
// https://math.stackexchange.com/a/1798323
__device__ double lerp(double start, double end, double weight) {
if (weight < 0.5) {
return start + weight * (end - start);
} else {
return end - (end - start) * (1.0 - weight);
}
}
__device__ float lerp(float start, float end, float weight) {
if (weight < 0.5f) {
return start + weight * (end - start);
} else {
return end - (end - start) * (1.0f - weight);
}
}
__device__ float lerp(float start, float end, double weight) {
return lerp(start, end, static_cast<float>(weight));
}
__device__ constexpr int max(int a, int b) {
return a > b ? a : b;
}
__device__ constexpr int64_t max(int64_t a, int b) {
return a > (int64_t)b ? a : (int64_t)b;
}
__device__ constexpr int64_t max(int a, int64_t b) {
return (int64_t)a > b ? (int64_t)a : b;
}
__device__ constexpr int64_t max(int64_t a, int64_t b) {
return a > b ? a : b;
}
__device__ double fmax(double a, double b) {
// check and propagate NaN
if (a != a) {
return a;
} else { // If b is nan, it will be returned in the next line
return a > b ? a : b;
}
}
__device__ float fmax(float a, float b) {
// check and propagate NaN
if (a != a) {
return a;
} else { // If b is nan, it will be returned in the next line
return a > b ? a : b;
}
}
__device__ constexpr int min(int a, int b) {
return a > b ? b : a;
}
__device__ constexpr int64_t min(int64_t a, int b) {
return (int64_t)a > b ? b : (int64_t)a;
}
__device__ constexpr int64_t min(int a, int64_t b) {
return a > (int64_t)b ? (int64_t)b : a;
}
__device__ constexpr int64_t min(int64_t a, int64_t b) {
return a > b ? b : a;
}
__device__ double fmin(double a, double b) {
// check and propagate NaN
if (b != b) {
return b;
} else { // If a is nan, it will be returned in the next line
return a > b ? b : a;
}
}
__device__ float fmin(float a, float b) {
// check and propagate NaN
if (b != b) {
return b;
} else { // If a is nan, it will be returned in the next line
return a > b ? b : a;
}
}
__device__ constexpr int alignBufferSize(int buffer, int size) {
return (buffer + (size - 1)) & ~(size - 1);
}
__device__ double clamp(double x, double minv, double maxv) {
return fmin(fmax(x, minv), maxv);
}
__device__ float clamp(float x, double minv, double maxv) {
return fmin(fmax((double)x, minv), maxv);
}
__device__ int clamp(int x, int64_t minv, int64_t maxv) {
return min(max((int64_t)x, minv), maxv);
}
__device__ int64_t clamp(int64_t x, int64_t minv, int64_t maxv) {
return min(max(x, minv), maxv);
}
__device__ double frac(double x) {
return x - trunc(x);
}
__device__ float frac(float x) {
return x - trunc(x);
}
__device__ double reciprocal(double x) {
return 1 / x;
}
__device__ float reciprocal(float x) {
return 1 / x;
}
__device__ double relu(double x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(float x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int64_t x) {
return x <= 0 ? 0 : x;
}
__device__ float relu(int x) {
return x <= 0 ? 0 : x;
}
__device__ double remainder(double a, double b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ float remainder(float a, float b) {
auto mod = ::fmod(a, b);
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ double sigmoid(double x) {
return 1.0 / (1.0 + exp(-x));
}
__device__ float sigmoid(float x) {
return 1.0f / (1.0f + exp(-x));
}
__device__ double silu(double x) {
return x * sigmoid(x);
}
__device__ float silu(float x) {
return x * sigmoid(x);
}
__device__ double threshold(double x, double t, double v) {
return x <= t ? v : x;
}
__device__ float threshold(float x, double t, double v) {
return x <= t ? v : x;
}
__device__ int threshold(int x, int64_t t, int64_t v) {
return x <= t ? v : x;
}
__device__ int64_t threshold(int64_t x, int64_t t, int64_t v) {
return x <= t ? v : x;
}
__device__ constexpr int64_t remainder(int64_t a, int64_t b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int remainder(int a, int b) {
auto mod = a % b;
if ((mod != 0) && ((b < 0) != (mod < 0)))
mod += b;
return mod;
}
__device__ constexpr int64_t fmod(int64_t a, int64_t b) {
return a % b;
}
__device__ constexpr int fmod(int a, int b) {
return a % b;
}
__device__ constexpr double fmod(double a, double b) {
return ::fmod(a, b);
}
__device__ constexpr float fmod(float a, float b) {
return ::fmod(a, b);
}
__device__ constexpr double nextafter(double a, double b) {
return ::nextafter(a, b);
}
__device__ constexpr float nextafter(float a, float b) {
return ::nextafterf(a, b);
}
template <typename T>
__device__ T pow(T a, T b) {
if (b < 0) {
if (a == 1) {
return 1;
} else if (a == -1) {
auto negative = (-b) % static_cast<T>(2);
return negative ? -1 : 1;
} else {
return 0;
}
} else {
T result = 1;
while (b) {
if (b & 1) {
result *= a;
}
b /= 2;
a *= a;
}
return result;
}
}
template __device__ int pow<int>(int a, int b);
template __device__ int64_t pow<int64_t>(int64_t a, int64_t b);
template <>
__device__ float pow<float>(float a, float b) {
return ::pow(a, b);
}
template <>
__device__ double pow<double>(double a, double b) {
return ::pow(a, b);
}
__device__ float pow(float a, int b) {
return pow(a, (float)b);
}
__device__ double pow(double a, int b) {
return pow(a, (double)b);
}
__device__ float pow(float a, int64_t b) {
return pow(a, (float)b);
}
__device__ double pow(double a, int64_t b) {
return pow(a, (double)b);
}
__device__ int64_t pow(int64_t a, int b) {
return pow(a, (int64_t)b);
}
__device__ int64_t pow(int a, int64_t b) {
return pow((int64_t)a, b);
}
__device__ double rsqrt(double z) {
return ::rsqrt(z);
}
__device__ float rsqrt(float z) {
return ::rsqrtf(z);
}
__device__ int rsqrt(int z) {
return ::rsqrtf((float)z);
}
__device__ int64_t rsqrt(int64_t z) {
return ::rsqrt((double)z);
}
__device__ double signbit(double a) {
return ::signbit(a);
}
__device__ float signbit(float a) {
return ::signbit(a);
}
__device__ int signbit(int a) {
return a < 0;
}
__device__ int64_t signbit(int64_t a) {
return a < 0;
}
// Reference:
// https://en.wikipedia.org/wiki/Euclidean_algorithm#Implementations
// https://github.com/pytorch/pytorch/blob/c9f4f01981fd73fcc7c27676cc50230cd1b5bc22/aten/src/ATen/native/Math.h#L1232
template <typename T>
__device__ T gcd(T a, T b) {
a = abs(a);
b = abs(b);
while (b != 0) {
auto t = b;
b = a % b;
a = t;
}
return a;
}
template <typename T>
bool isfinite(T x) {
return ::isfinite(x);
}
// ref:
// https://github.com/NVIDIA/cutlass/blob/6fbc0d33800008d3180d3fefed4e1a653e5f72a0/include/cutlass/bfloat16.h#L213
template <>
bool isfinite<__bfloat>(__bfloat x) {
const auto exponent_biased = int((x.raw() >> 7) & 0x0ff);
return exponent_biased != 0x0ff;
}
// ref:
// https://github.com/NVIDIA/cutlass/blob/6fbc0d33800008d3180d3fefed4e1a653e5f72a0/include/cutlass/half.h#L511
template <>
bool isfinite<__half>(__half x) {
const auto exponent_biased = int((x.raw() >> 10) & 0x1f);
return exponent_biased != 0x1f;
}
template <typename T>
bool isinf(T x) {
return ::isinf(x);
}
////////////////////////////////////////////////////////////
// TODO: the following overloads are only needed for CUDA //
// 10.2 Please remove when CUDA 10.2 support is dropped //
////////////////////////////////////////////////////////////
bool isinf(int64_t x) {
return false;
}
bool isinf(int x) {
return false;
}
bool isinf(short x) {
return false;
}
bool isinf(char x) {
return false;
}
bool isinf(unsigned char x) {
return false;
}
bool isinf(bool x) {
return false;
}
bool isfinite(int64_t x) {
return true;
}
bool isfinite(int x) {
return true;
}
bool isfinite(short x) {
return true;
}
bool isfinite(char x) {
return true;
}
bool isfinite(unsigned char x) {
return true;
}
bool isfinite(bool x) {
return true;
}
////////////////////////////////////////////////////////////
// End TODO //
////////////////////////////////////////////////////////////
template <typename T>
bool isnan(T x) {
return x != x;
}
template <typename T>
bool isneginf(T x) {
return x < 0 && isinf(x);
}
template <typename T>
bool isposinf(T x) {
return x > 0 && isinf(x);
}
template <typename T>
bool isreal(T x) {
return true;
}
// Return the current value of the cycle counter
__device__ inline int64_t readCycleCounter() {
// Ensures preceding memory operations are completed. Doing this
// would make sense for measuring elapsed times enclosed with this
// function.
__threadfence();
return clock64();
}
__device__ float print_impl(const char* name, float value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
__device__ double print_impl(const char* name, double value) {
printf(
"%s = %lf @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
__device__ int print_impl(const char* name, int value) {
printf(
"%s = %d @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
__device__ int64_t print_impl(const char* name, int64_t value) {
printf(
"%s = %ld @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value,
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
__device__ bool print_impl(const char* name, bool value) {
printf(
"%s = %s @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
value ? "true" : "false",
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
__device__ __half print_impl(const char* name, __half value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
__half2float(value),
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
#if __CUDACC_VER_MAJOR__ >= 11
__device__ __bfloat print_impl(const char* name, __bfloat value) {
printf(
"%s = %f @ threadIdx=(%d,%d,%d), blockIdx=(%d,%d,%d)\n",
name,
__bfloat2float(value),
(int)threadIdx.x,
(int)threadIdx.y,
(int)threadIdx.z,
(int)blockIdx.x,
(int)blockIdx.y,
(int)blockIdx.z);
return value;
}
#endif
#define print(...) print_impl(#__VA_ARGS__, (__VA_ARGS__))
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace index_utils {
// Utility functions
// Total size of provided dimension
template <typename _dim3>
__device__ __forceinline__ nvfuser_index_t size(const _dim3& d) {
return (nvfuser_index_t)d.x * (nvfuser_index_t)d.y * (nvfuser_index_t)d.z;
}
// Linearized indexing of idx based on dim, if bool==false that dimension does
// not participate
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t maskedOffset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = 0;
if (Z)
offset += idx.z;
if (Y)
offset = offset * dim.y + idx.y;
if (X)
offset = offset * dim.x + idx.x;
return offset;
}
// Linearized indexing of idx based on dim. All dimensions participate.
template <typename _dim3, typename _dim3_2>
__device__ nvfuser_index_t offset(const _dim3& idx, const _dim3_2& dim) {
nvfuser_index_t offset = idx.z;
offset = offset * dim.y + idx.y;
offset = offset * dim.x + idx.x;
return offset;
}
// Masks the provided dim3, those == false get truncated to 1
template <bool X, bool Y, bool Z, typename _dim3>
__device__ dim3 maskedDims(const _dim3& dim) {
return dim3{
X ? (unsigned)dim.x : 1U,
Y ? (unsigned)dim.y : 1U,
Z ? (unsigned)dim.z : 1U};
}
// Provides total size of dim with masking, those dims == false do not
// participate in the size calculation
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
__device__ nvfuser_index_t maskedSize(const _dim3& dim) {
return size(maskedDims<X_BLOCK, Y_BLOCK, Z_BLOCK>(dim));
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3>
__device__ bool maskedIsZero(const _dim3& idx) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == 0;
if (Y)
isZero = isZero && idx.y == 0;
if (Z)
isZero = isZero && idx.z == 0;
return isZero;
}
// Checks if provided idx is zero on those dims == true
template <bool X, bool Y, bool Z, typename _dim3, typename _dim3_2>
__device__ bool maskedIsLast(const _dim3& idx, const _dim3_2& dim) {
bool isZero = true;
if (X)
isZero = isZero && idx.x == dim.x - 1;
if (Y)
isZero = isZero && idx.y == dim.y - 1;
if (Z)
isZero = isZero && idx.z == dim.z - 1;
return isZero;
}
} // namespace index_utils
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// std::tuple-like type
template <typename... Types>
struct Tuple;
#define TUPLE_INCREMENT_PTR(idx) \
do { \
static_assert( \
IsPointerType<T##idx>::value, "Invalid for non-pointer types"); \
val##idx += offset; \
} while (0)
template <typename T0>
struct Tuple<T0> {
T0 val0;
Tuple() = default;
__device__ Tuple(T0 _val0) : val0(_val0) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
}
};
template <typename T0, typename T1>
struct Tuple<T0, T1> {
T0 val0;
T1 val1;
Tuple() = default;
__device__ Tuple(T0 _val0, T1 _val1) : val0(_val0), val1(_val1) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
}
};
template <typename T0, typename T1, typename T2>
struct Tuple<T0, T1, T2> {
T0 val0;
T1 val1;
T2 val2;
Tuple() = default;
__device__ Tuple(T0 _val0, T1 _val1, T2 _val2)
: val0(_val0), val1(_val1), val2(_val2) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
}
};
template <typename T0, typename T1, typename T2, typename T3>
struct Tuple<T0, T1, T2, T3> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
Tuple() = default;
__device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3)
: val0(_val0), val1(_val1), val2(_val2), val3(_val3) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
}
};
template <typename T0, typename T1, typename T2, typename T3, typename T4>
struct Tuple<T0, T1, T2, T3, T4> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
T4 val4;
Tuple() = default;
__device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3, T4 _val4)
: val0(_val0), val1(_val1), val2(_val2), val3(_val3), val4(_val4) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
TUPLE_INCREMENT_PTR(4);
}
};
template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5>
struct Tuple<T0, T1, T2, T3, T4, T5> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
T4 val4;
T5 val5;
Tuple() = default;
__device__ Tuple(T0 _val0, T1 _val1, T2 _val2, T3 _val3, T4 _val4, T5 _val5)
: val0(_val0),
val1(_val1),
val2(_val2),
val3(_val3),
val4(_val4),
val5(_val5) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
TUPLE_INCREMENT_PTR(4);
TUPLE_INCREMENT_PTR(5);
}
};
template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename T6>
struct Tuple<T0, T1, T2, T3, T4, T5, T6> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
T4 val4;
T5 val5;
T6 val6;
Tuple() = default;
__device__ Tuple(
T0 _val0,
T1 _val1,
T2 _val2,
T3 _val3,
T4 _val4,
T5 _val5,
T6 _val6)
: val0(_val0),
val1(_val1),
val2(_val2),
val3(_val3),
val4(_val4),
val5(_val5),
val6(_val6) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
TUPLE_INCREMENT_PTR(4);
TUPLE_INCREMENT_PTR(5);
TUPLE_INCREMENT_PTR(6);
}
};
template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename T6,
typename T7>
struct Tuple<T0, T1, T2, T3, T4, T5, T6, T7> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
T4 val4;
T5 val5;
T6 val6;
T7 val7;
Tuple() = default;
__device__ Tuple(
T0 _val0,
T1 _val1,
T2 _val2,
T3 _val3,
T4 _val4,
T5 _val5,
T6 _val6,
T7 _val7)
: val0(_val0),
val1(_val1),
val2(_val2),
val3(_val3),
val4(_val4),
val5(_val5),
val6(_val6),
val7(_val7) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
TUPLE_INCREMENT_PTR(4);
TUPLE_INCREMENT_PTR(5);
TUPLE_INCREMENT_PTR(6);
TUPLE_INCREMENT_PTR(7);
}
};
template <
typename T0,
typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename T6,
typename T7,
typename T8,
typename T9,
typename T10,
typename T11,
typename T12,
typename T13,
typename T14,
typename T15>
struct Tuple<
T0,
T1,
T2,
T3,
T4,
T5,
T6,
T7,
T8,
T9,
T10,
T11,
T12,
T13,
T14,
T15> {
T0 val0;
T1 val1;
T2 val2;
T3 val3;
T4 val4;
T5 val5;
T6 val6;
T7 val7;
T8 val8;
T9 val9;
T10 val10;
T11 val11;
T12 val12;
T13 val13;
T14 val14;
T15 val15;
Tuple() = default;
__device__ Tuple(
T0 _val0,
T1 _val1,
T2 _val2,
T3 _val3,
T4 _val4,
T5 _val5,
T6 _val6,
T7 _val7,
T8 _val8,
T9 _val9,
T10 _val10,
T11 _val11,
T12 _val12,
T13 _val13,
T14 _val14,
T15 _val15)
: val0(_val0),
val1(_val1),
val2(_val2),
val3(_val3),
val4(_val4),
val5(_val5),
val6(_val6),
val7(_val7),
val8(_val8),
val9(_val9),
val10(_val10),
val11(_val11),
val12(_val12),
val13(_val13),
val14(_val14),
val15(_val15) {}
// Only valid when instantiated for pointer types
__device__ void operator+=(nvfuser_index_t offset) {
TUPLE_INCREMENT_PTR(0);
TUPLE_INCREMENT_PTR(1);
TUPLE_INCREMENT_PTR(2);
TUPLE_INCREMENT_PTR(3);
TUPLE_INCREMENT_PTR(4);
TUPLE_INCREMENT_PTR(5);
TUPLE_INCREMENT_PTR(6);
TUPLE_INCREMENT_PTR(7);
TUPLE_INCREMENT_PTR(8);
TUPLE_INCREMENT_PTR(9);
TUPLE_INCREMENT_PTR(10);
TUPLE_INCREMENT_PTR(11);
TUPLE_INCREMENT_PTR(12);
TUPLE_INCREMENT_PTR(13);
TUPLE_INCREMENT_PTR(14);
TUPLE_INCREMENT_PTR(15);
}
};
#undef TUPLE_INCREMENT_PTR
// Accessor for Tuple
template <int idx>
struct get;
#define DEFINE_TUPLE_GET(idx) \
template <> \
struct get<idx> { \
template <typename Tuple> \
__device__ auto& operator()(Tuple& vals) { \
return vals.val##idx; \
} \
template <typename Tuple> \
__device__ const auto& operator()(const Tuple& vals) { \
return vals.val##idx; \
} \
};
DEFINE_TUPLE_GET(0);
DEFINE_TUPLE_GET(1);
DEFINE_TUPLE_GET(2);
DEFINE_TUPLE_GET(3);
DEFINE_TUPLE_GET(4);
DEFINE_TUPLE_GET(5);
DEFINE_TUPLE_GET(6);
DEFINE_TUPLE_GET(7);
DEFINE_TUPLE_GET(8);
DEFINE_TUPLE_GET(9);
DEFINE_TUPLE_GET(10);
DEFINE_TUPLE_GET(11);
DEFINE_TUPLE_GET(12);
DEFINE_TUPLE_GET(13);
DEFINE_TUPLE_GET(14);
DEFINE_TUPLE_GET(15);
#undef DEFINE_TUPLE_GET
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset = 0);
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset = 0);
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
typename DstType::template ValType<0> src);
template <typename... Types>
class LocalTuple {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
template <int idx>
using ValType = typename TypeSelector<idx, Types...>::type;
LocalTuple() = default;
__device__ explicit LocalTuple(Types... args) : vals_(args...) {}
__device__ LocalTuple(const LocalTuple& other) : vals_(other.vals_) {}
template <template <typename...> typename TupleType>
__device__ LocalTuple(const TupleType<Types...>& other) {
copyTuple(*this, other);
}
__device__ LocalTuple& operator=(const LocalTuple<Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <template <typename...> typename TupleType>
__device__ LocalTuple& operator=(const TupleType<Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <int val_idx>
__device__ auto& val(nvfuser_index_t ptr_offset = 0) {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
template <int val_idx>
__device__ const auto& val(nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
private:
Tuple<Types...> vals_;
};
template <bool is_volatile, typename... Types>
class PtrTupleBase {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
template <int idx>
using ValType = typename TypeSelector<idx, Types...>::type;
template <int val_idx>
using TypeIMaybeVolatile = typename MaybeVolatile<
typename TypeSelector<val_idx, Types...>::type,
is_volatile>::type;
__device__ PtrTupleBase(Types*... args) : vals_(args...) {}
__device__ PtrTupleBase(const PtrTupleBase& other) : vals_(other.vals_) {}
// Note: this is a deep copy
__device__ PtrTupleBase& operator=(
const PtrTupleBase<is_volatile, Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <template <typename...> typename TupleType>
__device__ PtrTupleBase& operator=(const TupleType<Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <int val_idx>
__device__ TypeIMaybeVolatile<val_idx>& val(nvfuser_index_t ptr_offset = 0) {
static_assert(val_idx < num_vals, "Out-of-range value index");
return ((TypeIMaybeVolatile<val_idx>*)get<val_idx>()(vals_))[ptr_offset];
}
template <int val_idx>
__device__ const TypeIMaybeVolatile<val_idx>& val(
nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return ((TypeIMaybeVolatile<val_idx>*)get<val_idx>()(vals_))[ptr_offset];
}
__device__ void operator+=(nvfuser_index_t ptr_offset) {
vals_ += ptr_offset;
}
private:
Tuple<Types*...> vals_;
};
template <typename... Types>
class RefTuple {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
template <int idx>
using ValType = typename TypeSelector<idx, Types...>::type;
__device__ RefTuple(Types&... args) : vals_(args...) {}
__device__ RefTuple(const RefTuple& other) : vals_(other.vals_) {}
template <template <typename...> typename TupleType>
__device__ RefTuple(const TupleType<Types...>& other) {
copyTuple(*this, other);
}
__device__ RefTuple& operator=(const RefTuple<Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <template <typename...> typename TupleType>
__device__ RefTuple& operator=(const TupleType<Types...>& other) {
copyTuple(*this, other);
return *this;
}
template <int val_idx>
__device__ auto& val(nvfuser_index_t ptr_offset = 0) {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
template <int val_idx>
__device__ const auto& val(nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
private:
Tuple<Types&...> vals_;
};
template <typename DstType, typename SrcType, int num_vals>
struct TupleCopy {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::
value,
"Invalid value types");
TupleCopy<DstType, SrcType, num_vals - 1>::copy(
dst, dst_offset, src, src_offset);
dst.val<num_vals - 1>(dst_offset) = src.val<num_vals - 1>(src_offset);
}
};
template <typename DstType, typename SrcType>
struct TupleCopy<DstType, SrcType, 0> {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {}
};
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::value,
"Invalid value types");
TupleCopy<DstType, SrcType, DstType::num_vals>::copy(
dst, dst_offset, src, src_offset);
};
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyTuple(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset) {
copyTuple<DstType, SrcType>(dst, 0, src, src_offset);
};
template <typename DstType, int num_vals>
struct TupleSet {
__inline__ __device__ static void set(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {
static_assert(
IsSameType<
typename DstType::template ValType<num_vals - 1>,
typename DstType::template ValType<0>>::value,
"Invalid value types");
TupleSet<DstType, num_vals - 1>::set(dst, dst_offset, src);
dst.val<num_vals - 1>(dst_offset) = src;
}
};
template <typename DstType>
struct TupleSet<DstType, 0> {
__inline__ __device__ static void set(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {}
};
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
nvfuser_index_t dst_offset,
typename DstType::template ValType<0> src) {
TupleSet<DstType, DstType::num_vals>::set(dst, dst_offset, src);
};
template <typename DstType>
__inline__ __device__ static void setTuple(
DstType& dst,
typename DstType::template ValType<0> src) {
setTuple(dst, 0, src);
};
template <typename DstType, typename SrcType, typename PredType, int num_vals>
struct PredicatedTupleCopy {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
static_assert(
IsSameType<typename PredType::template ValType<num_vals - 1>, bool>::
value,
"Invalid predicate type");
PredicatedTupleCopy<DstType, SrcType, PredType, num_vals - 1>::copy(
dst, dst_offset, src, src_offset, pred);
if (pred.val<num_vals - 1>(0)) {
dst.val<num_vals - 1>(dst_offset) = src.val<num_vals - 1>(src_offset);
}
}
};
template <typename DstType, typename SrcType, typename PredType>
struct PredicatedTupleCopy<DstType, SrcType, PredType, 0> {
__inline__ __device__ static void copy(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {}
};
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
static_assert(
IsSameType<typename DstType::ValTypes, typename SrcType::ValTypes>::value,
"Invalid value types");
static_assert(
PredType::num_vals == DstType::num_vals, "Invalid predicate type");
PredicatedTupleCopy<DstType, SrcType, PredType, DstType::num_vals>::copy(
dst, dst_offset, src, src_offset, pred);
};
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset,
const PredType& pred) {
copyTupleIf(dst, 0, src, src_offset, pred);
};
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyTupleIf(
DstType& dst,
const SrcType& src,
const PredType& pred) {
copyTupleIf(dst, 0, src, 0, pred);
};
// Can a generic const and non-const RefTupe be defined?
template <typename... Types>
class ConstRefTuple {
public:
static constexpr int num_vals = sizeof...(Types);
using ValTypes = TypeList<Types...>;
__device__ ConstRefTuple(const Types&... args) : vals_(args...) {}
__device__ ConstRefTuple(const ConstRefTuple& other) : vals_(other.vals_) {}
template <template <typename...> typename TupleType>
__device__ ConstRefTuple(const TupleType<Types...>& other) {
copyTuple(*this, other);
}
template <int val_idx>
__device__ const auto& val(nvfuser_index_t ptr_offset = 0) const {
static_assert(val_idx < num_vals, "Out-of-range value index");
return get<val_idx>()(vals_);
}
private:
Tuple<const Types&...> vals_;
};
template <typename... Types>
using PtrTuple = PtrTupleBase<false, Types...>;
template <typename... Types>
using VolatilePtrTuple = PtrTupleBase<true, Types...>;
// Define a LocalTuple of NumVals values of type Type
template <int NumVals, typename Type>
struct MakeLocalTuple;
template <typename Type>
struct MakeLocalTuple<1, Type> {
using type = LocalTuple<Type>;
};
template <typename Type>
struct MakeLocalTuple<2, Type> {
using type = LocalTuple<Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<3, Type> {
using type = LocalTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<4, Type> {
using type = LocalTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<5, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<6, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<7, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<8, Type> {
using type = LocalTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeLocalTuple<16, Type> {
using type = LocalTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
template <int NumVals, typename Type>
struct MakeRefTuple;
template <typename Type>
struct MakeRefTuple<1, Type> {
using type = RefTuple<Type>;
};
template <typename Type>
struct MakeRefTuple<2, Type> {
using type = RefTuple<Type, Type>;
};
template <typename Type>
struct MakeRefTuple<3, Type> {
using type = RefTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<4, Type> {
using type = RefTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<5, Type> {
using type = RefTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<6, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<7, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<8, Type> {
using type = RefTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeRefTuple<16, Type> {
using type = RefTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
template <int NumVals, typename Type>
struct MakeConstRefTuple;
template <typename Type>
struct MakeConstRefTuple<1, Type> {
using type = ConstRefTuple<Type>;
};
template <typename Type>
struct MakeConstRefTuple<2, Type> {
using type = ConstRefTuple<Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<3, Type> {
using type = ConstRefTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<4, Type> {
using type = ConstRefTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<5, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<6, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<7, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<8, Type> {
using type = ConstRefTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeConstRefTuple<16, Type> {
using type = ConstRefTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
template <int NumVals, typename Type>
struct MakeVolatilePtrTuple;
template <typename Type>
struct MakeVolatilePtrTuple<1, Type> {
using type = VolatilePtrTuple<Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<2, Type> {
using type = VolatilePtrTuple<Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<3, Type> {
using type = VolatilePtrTuple<Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<4, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<5, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<6, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<7, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<8, Type> {
using type = VolatilePtrTuple<Type, Type, Type, Type, Type, Type, Type, Type>;
};
template <typename Type>
struct MakeVolatilePtrTuple<16, Type> {
using type = VolatilePtrTuple<
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type,
Type>;
};
// Utility definitions. Currently only used with LocalTuple
template <int idx, typename BinaryFunc, typename... DataTypes>
struct TupleBinaryOp {
static __inline__ __device__ void apply(
BinaryFunc func,
const LocalTuple<DataTypes...>& lhs,
const LocalTuple<DataTypes...>& rhs,
LocalTuple<DataTypes...>& result) {
TupleBinaryOp<idx - 1, BinaryFunc, DataTypes...>::apply(
func, lhs, rhs, result);
result.val<idx - 1>(0) = func(lhs.val<idx - 1>(0), rhs.val<idx - 1>(0));
}
};
template <typename BinaryFunc, typename... DataTypes>
struct TupleBinaryOp<0, BinaryFunc, DataTypes...> {
static __inline__ __device__ void apply(
BinaryFunc func,
const LocalTuple<DataTypes...>& lhs,
const LocalTuple<DataTypes...>& rhs,
LocalTuple<DataTypes...>& result) {}
};
template <typename BinaryFunc, typename... DataTypes>
__inline__ __device__ LocalTuple<DataTypes...> apply(
BinaryFunc func,
const LocalTuple<DataTypes...>& lhs,
const LocalTuple<DataTypes...>& rhs) {
LocalTuple<DataTypes...> result = lhs;
TupleBinaryOp<sizeof...(DataTypes), BinaryFunc, DataTypes...>::apply(
func, result, rhs, result);
return result;
}
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
const LocalTuple<BoolTypes...>& lhs,
const LocalTuple<BoolTypes...>& rhs) {
return apply([](bool x, bool y) { return x && y; }, lhs, rhs);
}
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
bool lhs,
const LocalTuple<BoolTypes...>& rhs) {
LocalTuple<BoolTypes...> lhs_tuple;
setTuple(lhs_tuple, lhs);
return lhs_tuple && rhs;
}
template <typename... BoolTypes>
__inline__ __device__ LocalTuple<BoolTypes...> operator&&(
const LocalTuple<BoolTypes...>& lhs,
bool rhs) {
LocalTuple<BoolTypes...> rhs_tuple;
setTuple(rhs_tuple, rhs);
return lhs && rhs_tuple;
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Basically just blockDim, but wrapped as a struct so that we have a mechanism
// to know at compile time that whether we are just using blockDim or some
// custom value. For a kernel without warp specialization, we just use blockDim,
// but for a kernel with warp specialization, we use a custom block_dim whose
// dimension are the dimensions of the compute warps.
struct DefaultBlockDim {
const uint32_t x, y, z;
__device__ DefaultBlockDim() : x(blockDim.x), y(blockDim.y), z(blockDim.z) {}
__device__ operator dim3() const {
return blockDim;
}
};
// Default block synchronization. Just use __barrier_sync
namespace block_sync {
__forceinline__ __device__ void init() {}
// Thread-block synchronization
template <bool aligned, typename BlockDimT>
__forceinline__ __device__ void sync(BlockDimT block_dim) {
if constexpr (aligned) {
__syncthreads();
} else if constexpr (std::is_same_v<BlockDimT, DefaultBlockDim>) {
__barrier_sync(0);
} else {
uint32_t num_threads = block_dim.x * block_dim.y * block_dim.z;
asm volatile("bar.sync 0, %0;" : : "r"(num_threads) : "memory");
}
}
} // namespace block_sync
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace grid_sync {
// Get the first bit in a 64 bit integer
#define FIRST_UINT64_BIT ((uint64_t)1 << (sizeof(uint64_t) * 8 - 1))
template <typename T>
__device__ T globalAsVolatile(volatile T& global_val) {
return global_val;
}
// A grid synchronization that can be called multiple times in a kernel assuming
// all the blocks fit on device at once. The semaphore is an integer semaphore
// assumed to be initialized to 0 before launching the kernel. The persistent
// option should be envoked if this sync will be called multiple times in one
// kernel (i.e. having a grid reduce within a loop). Having multiple grid syncs
// called once in the same kernel does not require persistent mode. Segment size
// is the number of blocks participating in the sync in the dimensions marked by
// [X,Y,Z]_BLOCK. The granularity of this sync are those dimensions. I.E.
// Marking X and Y but not Z means there should be Z semaphores of size X*Y.
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool PERSISTENT,
bool Aligned,
typename BlockDimT>
__device__ void sync(
int64_t& semaphore,
const uint64_t& segment_size,
const bool last_block,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// Finish all global memory transactions before synchronizing
__threadfence();
// Synchronize all threads in a block before synchronizing blocks
block_sync::sync<Aligned>(block_dim);
// Only allow linear_tid == 0 to participate in the synchronization
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// Get increment value, only want a single block to have the large
// increment, doesn't really matter which one, the goal is to flip/flop the
// first bit of a uint64_t value, since our semaphores are actualy int64_t
// we will just reinterpret_cast it to act as a uint64_t
uint64_t semaphore_increment = 1;
// Makes the assumption that blocks are in increasing order, this is not
// guaranteed by CUDA but this is the current behavior, and unlikely to
// change.
if (last_block) {
semaphore_increment = FIRST_UINT64_BIT - (segment_size - 1);
}
uint64_t oldArrive =
atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), semaphore_increment);
// If for persistent kernels, lock all blocks until the semaphore has been
// reached. Make sure we access semaphore as a volatile address so we get
// the global memory updates.
unsigned int ns = 8;
while ((PERSISTENT || last_block) &&
((oldArrive ^ globalAsVolatile(semaphore)) & FIRST_UINT64_BIT) ==
0) {
// Put a sleep here so we have some breaks in probing the global
// semaphore, giving a better chance for other warps/blocks to catch up.
#if __CUDA_ARCH__ >= 700
// __nanosleep only available on compute capability 7.0 or higher
__nanosleep(ns); // avoids busy waiting
if (ns < 256) {
ns *= 2;
}
#endif
}
}
// Sync block to make sure all other threads are waiting on the sync
block_sync::sync<Aligned>(block_dim);
}
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool PERSISTENT,
bool Aligned,
typename BlockDimT>
__device__ void sync(
int64_t& semaphore,
const uint64_t& segment_size,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT, Aligned>(
semaphore,
segment_size,
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim),
block_dim);
}
// Grid sync that can be called multiple times in the same kernel without all
// blocks being resident on device. This allows grid sync to be called multiple
// times as long as it's not broadcasted on the parallel axis it was reduced on.
//
// n_entrances is how many times every block is expected to enter into this
// function. All blocks must enter n_entrances times. The last block is only
// allowed to proceed once all other blocks have entered n_entrance
// times.
//
// Note that this is not currently used by grid and welford reduction
// as they use a separate sync flag for each each grid sync call.
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool Aligned,
typename BlockDimT>
__device__ void sync(
int64_t& semaphore,
const uint64_t& segment_size,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// Finish all global memory transactions before synchronizing
__threadfence();
// Synchronize all threads in a block before synchronizing blocks
block_sync::sync<Aligned>(block_dim);
// Only allow linear_tid == 0 to participate in the synchronization
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// Makes the assumption that blocks are in increasing order, this is not
// guaranteed by CUDA but this is the current behavior, and unlikely to
// change.
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
int64_t finished_val =
((int64_t)(index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(
gridDim) -
1)) *
((int64_t)n_entrances);
unsigned int ns = 8;
// Last block needs to wait for all other blocks to finish
while (globalAsVolatile(semaphore) < finished_val) {
#if __CUDA_ARCH__ >= 700
// __nanosleep only available on compute capability 7.0 or higher
__nanosleep(ns); // avoids busy waiting
if (ns < 256) {
ns *= 2;
}
#endif
}
} else {
auto old = atomicAdd(reinterpret_cast<uint64_t*>(&semaphore), 1);
}
}
// Sync block to make sure all other threads are waiting on the sync
block_sync::sync<Aligned>(block_dim);
}
// Non-blocking function to read the semaphore value in each calling thread
__device__ int64_t semaphoreFetch(int64_t* semaphore) {
int64_t state;
// NOTE: acquire/release operations require sm_70 or higher
// https://docs.nvidia.com/cuda/archive/12.3.0/parallel-thread-execution/index.html#scopes-and-applicability
asm volatile("ld.global.acquire.gpu.b64 %0, [%1];\n"
: "=l"(state)
: "l"(semaphore));
return state;
}
// Non-blocking function to set semaphore to new_value
__device__ void semaphoreRelease(int64_t* semaphore, int64_t new_value) {
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
// NOTE: acquire/release operations require sm_70 or higher
// https://docs.nvidia.com/cuda/archive/12.3.0/parallel-thread-execution/index.html#scopes-and-applicability
asm volatile("st.global.release.gpu.b64 [%0], %1;\n"
:
: "l"(semaphore), "l"(new_value));
}
}
// First thread waits until fetched semaphore value matches trigger
__device__ void semaphoreWait(int64_t* semaphore, int64_t trigger_value) {
int64_t status = -1;
// Cutlass uses a loop like this, and has a facility where any thread can
// fetch the semaphore value ahead of waiting. This could reduce the wait
// time potentially but requires placement of the early fetch.
// https://github.com/NVIDIA/cutlass/blob/main/include/cutlass/semaphore.h
// while (__syncthreads_and(status != trigger_value)) {
// As soon as any thread in the block observes the trigger then it is
// safe to proceed
// Instead, we simply use the first thread in the block to do busy waiting.
if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
while (status != trigger_value) {
status = semaphoreFetch(semaphore);
}
}
}
// Serialize blocks in segments indicated by the [XYZ]_BLOCK template arguments.
// This should be called at the beginning of the section to be serialized.
// Assumes semaphore is initialized to zero. This function always synchronizes
// the thread block.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
__device__ void blockSerializeWait(int64_t* semaphore) {
int segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
int block_idx_in_segment =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (block_idx_in_segment > 0) {
semaphoreWait(semaphore, block_idx_in_segment);
}
__syncthreads();
}
// Serialize blocks in segments indicated by the [XYZ]_BLOCK template arguments.
// This should be called at the end of the section to be serialized.
// This function always cleans up the semaphore; i.e. the last block writes the
// value 0 to the semaphore when complete. This function always synchronizes
// the thread block.
template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
__device__ void blockSerializeRelease(int64_t* semaphore) {
int segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
int block_idx_in_segment =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
bool last_block = block_idx_in_segment == segment_size - 1;
// Block until writes from all threads in this block are visible to all other
// blocks before releasing semaphore using thread 0.
//
// Consider this simple example using two blocks:
//
// 1. Block 1 acquires lock using blockSerializeWait
// 2. Block 1 writes values to tensor T3
// 3. Block 1 releases lock using blockSerializeRelease
// 4. Block 2 acquires lock using blockSerializeWait
// 5. Block 2 uses values in tensor T3 to compute new values and writes them
// back to T3.
// 6. Block 2 releases lock using blockSerializeRelease
//
// Without a global thread fence, the writes to T3 from Block 1 in step 2
// might not be visible to Block 2 at step 5, meaning Block 2 would compute
// an invalid update.
//
// We use __syncthreads also, which implies a __threadfence_block but that
// only guarantees that all writes are visible to threads _within the same
// block_, so the __threadfence is still needed.
__threadfence();
__syncthreads();
semaphoreRelease(semaphore, last_block ? 0 : block_idx_in_segment + 1);
}
} // namespace grid_sync
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Reference:
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#asynchronous-barrier
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_desc.hpp
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
namespace mbarrier {
__device__ inline void init(
uint32_t smem_barrier_ptr,
uint32_t thread_count = 1) {
asm volatile(
"mbarrier.init.shared.b64 [%0], %1;\n" ::"r"(smem_barrier_ptr),
"r"(thread_count));
}
__device__ inline void inval(uint32_t smem_barrier_ptr) {
asm volatile("mbarrier.inval.shared.b64 [%0];\n" ::"r"(smem_barrier_ptr));
}
__device__ inline uint64_t arrive(uint32_t smem_barrier_ptr) {
volatile uint64_t state;
asm volatile("mbarrier.arrive.shared.b64 %0, [%1];\n"
: "=l"(state)
: "r"(smem_barrier_ptr));
return state;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
__device__ inline uint64_t arriveExpectTX(
uint32_t smem_barrier_ptr,
uint32_t tx_count) {
volatile uint64_t state;
asm volatile("mbarrier.arrive.expect_tx.shared.b64 %0, [%1], %2;\n"
: "=l"(state)
: "r"(smem_barrier_ptr), "r"(tx_count));
return state;
}
__device__ inline void arrive(uint32_t smem_barrier_ptr, uint32_t cta_id) {
asm volatile(
"{.reg .b32 remaddr32;\n"
"mapa.shared::cluster.u32 remaddr32, %0, %1;\n"
"mbarrier.arrive.shared::cluster.b64 _, [remaddr32];\n"
"}"
:
: "r"(smem_barrier_ptr), "r"(cta_id));
}
#endif
__device__ inline void wait(uint32_t smem_barrier_ptr, uint64_t state) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile(
"{\n"
".reg .pred complete;\n"
"waitLoop:\n"
"mbarrier.try_wait.shared.b64 complete, [%0], %1;\n"
"@!complete bra waitLoop;\n"
"}\n" ::"r"(smem_barrier_ptr),
"l"(state));
#else
asm volatile(
"{\n"
".reg .pred P1;\n"
"LAB_WAIT:\n"
"mbarrier.test_wait.shared.b64 P1, [%0], %1;\n"
"@P1 bra.uni DONE;\n"
"nanosleep.u32 20;\n"
"bra.uni LAB_WAIT;\n"
"DONE:\n"
"}\n" ::"r"(smem_barrier_ptr),
"l"(state));
#endif
}
__device__ inline void waitParity(uint32_t smem_barrier_ptr, uint32_t parity) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
asm volatile(
"{\n"
".reg .pred complete;\n"
"waitLoop:\n"
"mbarrier.try_wait.parity.shared.b64 complete, [%0], %1;\n"
"@!complete bra waitLoop;\n"
"}\n" ::"r"(smem_barrier_ptr),
"r"(parity));
#else
asm volatile(
"{\n"
".reg .pred P1;\n"
"LAB_WAIT:\n"
"mbarrier.test_wait.parity.shared.b64 P1, [%0], %1;\n"
"@P1 bra.uni DONE;\n"
"nanosleep.u32 20;\n"
"bra.uni LAB_WAIT;\n"
"DONE:\n"
"}\n" ::"r"(smem_barrier_ptr),
"r"(parity));
#endif
}
} // namespace mbarrier
#endif // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block. If set to false the dimension doesn't
// participate in the reduction. We could start with warp reductions, then
// reduce the warps, this could save some shared memory, but could be slower in
// some instances.
//
// EXAMPLE USAGE:
// blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
// (output[output_index], inputs[input_index],
// [] __device__ (T& a, const T b) { a += b; });
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
T* shared_mem,
bool read_pred,
bool write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// If this thread will output a final result
bool should_write =
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(threadIdx);
// Size of the reduction segments
unsigned int reduction_size =
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
// Index into the reduction segment
unsigned int reduction_tid =
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
threadIdx, block_dim);
// Index of the reduction segment
unsigned int reduction_idx =
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
threadIdx, block_dim);
// number of reductions per block
unsigned int reduction_num =
index_utils::maskedSize<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(block_dim);
// smem_offset is the offset into shared memory for the current thread.
// To ensure coalesced access to shared memory, we need to ensure
// each transaction is accessing a contiguous block of 128 bytes.
// For outer reduction where TIDy is in the reduction dimension and TIDx
// is in the iteration dimension and TIDz is not used. We have
// reduction_tid = TIDy and reduction_idx = TIDx. If we directly use the
// offset based on reduction_tid and reduction_idx, we will have stride
// access to shared memory. For example:
// offset = reduction_idx * reduction_size + reduction_tid
// = TIDx * blockDim.y + TIDy
// To avoid this, we should always use the offset based on the indexing of
// threads within a block.
// Offset into smem for the current thread
unsigned int smem_offset = threadIdx.x + threadIdx.y * block_dim.x +
threadIdx.z * block_dim.x * block_dim.y;
// The peer stride represents the distance between the current element and its
// nearest reduction peer. It depends on the reduction dimension. A reduction
// peer refers to elements that belong to the same reduction segment. For
// example, if the reduction is across TIDy, all the elements in the same
// column (with the same TIDx) are considered peers of each other. The
// distance between an element and its nearest peer is block_dim.x.
constexpr int num_redu_dims = (int)X_REDUCE + (int)Y_REDUCE + (int)Z_REDUCE;
constexpr bool xz_reduce = (num_redu_dims == 2 && !Y_REDUCE);
// reduction in 3 dimensions, XYZ, stride is 1
unsigned int peer_stride = 1;
if (num_redu_dims == 1) {
// Reduction only in 1 dimension, X or Y or Z
// e.g. inner or outer reduction
// If X_REDUCE, reducing in neighbor cols in smem, peer_stride is 1
// If Y_REDUCE, reducing in neighbor rows in smem, peer_stride is
// block_dim.x If Z_REDUCE, reducing in neighbor planes in smem, peer_stride
// is block_dim.x * block_dim.y
peer_stride = X_REDUCE ? 1
: Y_REDUCE ? block_dim.x
: block_dim.x * block_dim.y;
} else if (num_redu_dims == 2) {
// Reduction in 2 dimensions, only one dimension is not reduced, !X, !Y, !Z
// If !Z_REDUCE, merge XY, reducing neighbor cols, peer_stride is 1
// If !X_REDUCE, merge ZY, reducing neighbor rows, peer_stride is
// block_dim.x If !Y_REDUCE, if block_dim.y == 1, merge XZ, peer_stride
// is 1. otherwise, needs carefully calculate offset to the reduction peer:
// (1) redu_offset = reduction_tid + tree_fold_factor
// (2) idz = redu_offset / block_dim.x
// (3) idx = redu_offset % block_dim.x
// (4) smem_offset = idx + threadIdx.y * block_dim.x + idz * block_dim.x *
// block_dim.y
if (!Y_REDUCE) {
peer_stride = 1;
} else {
peer_stride = !Z_REDUCE ? 1 : block_dim.x;
}
}
// Initialize shared memory
if (read_pred) {
shared_mem[smem_offset] = inp_val;
} else {
shared_mem[smem_offset] = init_val;
}
block_sync::sync<Aligned>(block_dim);
// Reduce down to nearest power of 2 for the tree reduction:
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
int peer_offset = smem_offset + np2 * peer_stride;
if constexpr (xz_reduce) {
if (block_dim.y > 1) {
int redu_offset = reduction_tid + np2;
int idz = redu_offset / block_dim.x;
int idx = redu_offset % block_dim.x;
peer_offset =
idx + threadIdx.y * block_dim.x + idz * block_dim.x * block_dim.y;
}
}
reduction_op(shared_mem[smem_offset], shared_mem[peer_offset]);
}
block_sync::sync<Aligned>(block_dim);
// loop peel the final iteration to save one syncthread for the end
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
int peer_offset = smem_offset + factor * peer_stride;
if constexpr (xz_reduce) {
if (block_dim.y > 1) {
int redu_offset = reduction_tid + factor;
int idz = redu_offset / block_dim.x;
int idx = redu_offset % block_dim.x;
peer_offset =
idx + threadIdx.y * block_dim.x + idz * block_dim.x * block_dim.y;
}
}
reduction_op(shared_mem[smem_offset], shared_mem[peer_offset]);
}
block_sync::sync<Aligned>(block_dim);
}
if (should_write && write_pred) {
T result = out;
reduction_op(result, shared_mem[smem_offset]);
if (reduction_size > 1) {
reduction_op(result, shared_mem[smem_offset + peer_stride]);
}
out = result;
}
block_sync::sync<Aligned>(block_dim);
}
// Use the same pred for both reads and writes
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void blockReduce(
T& out,
const T& inp_val,
Func reduction_op,
T* shared_mem,
bool read_write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
blockReduce<X_REDUCE, Y_REDUCE, Z_REDUCE, Aligned, T, Func>(
out,
inp_val,
reduction_op,
shared_mem,
read_write_pred,
read_write_pred,
init_val,
block_dim);
}
// Each thread in the iteration dimension processes N elements
// Typical usage is in outer reduction where the iteration dimension
// is parallelized by vectorized loads, bidmx. The reduction dimension
// is parallelized by bdimy. This function works as follows:
// (1) Each thread vectorized loads N elements from input register array to
// smem. (2) do N * bdimx parallel reductions in smem.
template <
bool Aligned,
int N, // Number of elements per input array
typename T,
typename Func,
typename BlockDimT>
__device__ void blockIterGroupedYdimReduce(
T out[N],
const T inp_val[N],
Func reduction_op,
T* shared_mem,
bool read_pred,
bool write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// N should be a valid vectorization factor
static_assert(
N == 2 || N == 4 || N == 8 || N == 16,
"N should be a valid vectorization factor, one of (2, 4, 8, 16)!");
bool should_write = threadIdx.y == 0;
unsigned int reduction_size = block_dim.y;
unsigned int reduction_tid = threadIdx.y;
// In shared memory, each row has 128 bytes, if sizeof(T) * N = 32 bytes, each
// row has 128 / 32 = 4 threads. Each transaction can only load data from one
// row, with a max of 16 bytes per thread. So the total bytes per transaction
// is 4 x 16 = 64 bytes which is only half of the maximum 128 bytes per
// transaction. we should change the layout from [TIDy, TIDx, N] to [N/4,
// TIDy, TIDx, 4]
constexpr unsigned int array_bytes = sizeof(T) * N;
constexpr unsigned int total_loads =
array_bytes / 16 > 1 ? array_bytes / 16 : 1;
constexpr unsigned int elements_per_load =
16 / sizeof(T) > N ? N : 16 / sizeof(T);
constexpr unsigned int align_size = array_bytes > 16 ? 16 : array_bytes;
// assume TIDy is the reduction dimension, TIDx is the iteration dimension
// TIDz is not used
unsigned int peer_stride = elements_per_load * block_dim.x;
unsigned int smem_offset_inter =
block_dim.x * block_dim.y * elements_per_load;
unsigned int smem_offset_intra =
(threadIdx.y * block_dim.x + threadIdx.x) * elements_per_load;
// load to [total_loads] sections of shared memory
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
loadGeneric<T, elements_per_load>(
shared_mem + smem_offset_inter * i + smem_offset_intra,
const_cast<T*>(inp_val) + i * elements_per_load);
}
block_sync::sync<Aligned>(block_dim);
// Reduce down to nearest power of 2 for the tree reduction:
// Perform parallel reduction for each element in the array
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
// vectorized load from smem to regs
__align__(align_size) T self[N];
__align__(align_size) T peer[N];
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int self_offset = smem_offset_inter * i + smem_offset_intra;
int peer_offset = self_offset + np2 * peer_stride;
loadGeneric<T, elements_per_load>(
self + i * elements_per_load, shared_mem + self_offset);
loadGeneric<T, elements_per_load>(
peer + i * elements_per_load, shared_mem + peer_offset);
}
// reduction
#pragma unroll
for (int i = 0; i < N; ++i) {
reduction_op(self[i], peer[i]);
}
// write self back to smem
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int self_offset = smem_offset_inter * i + smem_offset_intra;
loadGeneric<T, elements_per_load>(
shared_mem + self_offset, self + i * elements_per_load);
}
}
block_sync::sync<Aligned>(block_dim);
// Tree reduction
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
// vectorized load from smem to regs
__align__(align_size) T self[N];
__align__(align_size) T peer[N];
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int self_offset = smem_offset_inter * i + smem_offset_intra;
int peer_offset = self_offset + factor * peer_stride;
loadGeneric<T, elements_per_load>(
self + i * elements_per_load, shared_mem + self_offset);
loadGeneric<T, elements_per_load>(
peer + i * elements_per_load, shared_mem + peer_offset);
}
// reduction
#pragma unroll
for (int i = 0; i < N; ++i) {
reduction_op(self[i], peer[i]);
}
// write self back to smem
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int self_offset = smem_offset_inter * i + smem_offset_intra;
loadGeneric<T, elements_per_load>(
shared_mem + self_offset, self + i * elements_per_load);
}
}
block_sync::sync<Aligned>(block_dim);
}
// last reduction
if (should_write && write_pred) {
// init result
__align__(align_size) T result[N];
#pragma unroll
for (int i = 0; i < N; ++i) {
result[i] = out[i];
}
// copy first element to result
__align__(align_size) T self[N];
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int self_offset = smem_offset_inter * i + smem_offset_intra;
loadGeneric<T, elements_per_load>(
self + i * elements_per_load, shared_mem + self_offset);
}
#pragma unroll
for (int i = 0; i < N; ++i) {
reduction_op(result[i], self[i]);
}
// reduction of the 2nd last element
if (reduction_size > 1) {
__align__(align_size) T peer[N];
#pragma unroll
for (unsigned int i = 0; i < total_loads; ++i) {
int peer_offset =
smem_offset_inter * i + smem_offset_intra + peer_stride;
loadGeneric<T, elements_per_load>(
peer + i * elements_per_load, shared_mem + peer_offset);
}
#pragma unroll
for (int i = 0; i < N; ++i) {
reduction_op(result[i], peer[i]);
}
}
#pragma unroll
for (int i = 0; i < N; ++i) {
out[i] = result[i];
}
}
block_sync::sync<Aligned>(block_dim);
}
// Use the same pred for both reads and writes
template <
bool Aligned,
int N, // Number of elements per input array
typename T,
typename Func,
typename BlockDimT>
__device__ void blockIterGroupedYdimReduce(
T out[N],
const T inp_val[N],
Func reduction_op,
T* shared_mem,
bool read_write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
blockIterGroupedYdimReduce<Aligned, N, T, Func>(
out,
inp_val,
reduction_op,
shared_mem,
read_write_pred,
read_write_pred,
init_val,
block_dim);
}
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Inter-block reduction.
//
// The gridReduce function performs point-wise reductions of scalars across
// thread blocks. Thread blocks are disjointly partitioned into groups,
// "reduction segments", that are collectively defined by boolean template
// parameters, X_BLOCK, Y_BLOCK and Z_BLOCK. Each of X/Y/Z_BLOCK determines
// whether thread blocks along the dimension should be grouped into the same
// reduction segment. Cross-block reducitons are independently done within each
// segment and generates distinctive results per segment. For instance, if all
// of X/Y/Z_BLOCK are true, reductions will be done across all thread blocks
// since there will be just a single segment consisting of all thread blocks. If
// none of them are true, each thread block will become a segment by itself, so
// no reduction will be performed.
//
// The input scalars to reduce within each segment are a certain subset of
// thread-private scalars provided as part of the gridReduce function
// parameters. Boolean template parameters, X_THREAD, Y_THREAD and Z_THREAD,
// determine which subset of the scalars should be used for inter-block
// reductions. Specifically, all the input scalars of threads along each
// dimension will be used when X/Y/Z_THREAD are true. Otherwise, only the value
// held at offset 0 of each dimension will be used. Thus, for example, if all of
// X/Y/Z_THREAD are true, the scalars of all threads in each block will
// participate in inter-block reductions. If all of them are false, only one
// scalar of the thread at threadIdx.x == threadIdx.y == threadIdx.z == 0 will
// be used. In the code below, we call the subset of threads a "reduction
// block". "Participating" thread dimensions here are similar to the
// "non-participating" block dimensions. They come from a block dimension that
// has not been reduced before hitting this grid reduction.
//
// Inter-block reductions perform point-wise reductions of scalars of reduction
// blocks within each reduction segment. More specifically, let rb be a
// reduction block and rs be a reduction segment. Let IN(thread_idx, block_idx)
// denote the input scalar of thread at thread_idx and block_idx. The result of
// each reduction segment, OUT(thread_idx, block_idx_out), is defined only for
// each thread_idx in thread block block_idx_out in the segment as follows:
//
// OUT(thread_idx, block_idx_out) =
// Reduction of IN(thread_idx, block_idx) for
// all block_idx in a reduction segment
//
// OUT is not given for all threads that are not in block_idx_out and the
// reduction block.
//
// See also the function comment of gridReduce.
namespace reduction {
// Reduces all the reduction blocks in each reduction segment. This is the
// "cleanup" stage of a grid reduction.
//
// This is only called by one thread block per reduction segment. The input
// reduction blocks of the segment are stored in an intermediate buffer pointed
// by parameter in. Template parameters X/Y/Z_THREAD denote how the reduction
// block is formed.
//
// The size of a reduction block is by definition smaller or equal to the size
// of a thread block. We use the remaining threads to parallelize reductions
// across reduction blocks. For example, when X/Y/Z_THREAD = {true, false,
// false}, we use blockDim.y*blockDim.z threads for each output value. This is
// done first by loading the input values in parallel and then by reducing
// across threads of dimensions whose XYZ_THREAD are false.
//
// Note that what is done here after the loading from global memory is similar
// to what the existing blockReduce function does.
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void gridReduceLastBlock(
T& out,
const volatile T* in,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t
block_reduction_segment_size, // Number of reductions across the block
Func reduction_op,
T* shared_buf,
bool write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, block_dim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
T inp = init_val;
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto work_buf_offset = reduction_i * block_reduction_segment_size +
block_reduction_segment_idx;
reduction_op(inp, in[work_buf_offset]);
}
// Block reduce the per thread values into per "participating" thread values
T inp_tmp = init_val;
blockReduce<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
inp_tmp, inp, reduction_op, shared_buf, true, init_val, block_dim);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
reduction_op(out, inp_tmp);
}
}
// Reduces per-thread values across threads and thread blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - reduction_op: Scalar reduction function
// - work_buf: Temporary buffer for cross-block reductions
// - sync_flags: A vector of integers for synchronizations
// - shared_buf: Shared memory buffer for intra-block reduction
//
// Thread has valid results based on if it's the last block in the grid
// reduction dimension
//
// Template parameters:
// - X/Y/Z_BLOCK/THREAD: When true, reduces across thread blocks along the X/Y/Z
// dimensions
// - PERSISTENT_REDUCTION: Indicates grid reduction will be called in a loop, or
// the result of the grid reduction will be broadcasted and used across the
// grid. These requires cross grid communication and the grid synchronizations
// here to actually synchronize across the entire grid. When false the grid is
// not synchronized, the last block just waits for everyone else to finish and
// the other blocks can exit early.
// - T: Scalar data type of input/output data
// - Func: Type of scalara reduction function
//
// Template parameters X/Y/Z_BLOCK define a group of thread blocks that are
// reduced together. We call it a reduction segment. Some examples are:
//
// Case 1: X/Y/Z_BLOCK == true/true/true -> There is only one segment, which
// includes all thread blocks. It is effecively the same as the grid.
//
// Case 2: X/Y/Z_BLOCK == false/false/false -> Each thread block comprises an
// individual segment by itself.
//
// Case 3: X/Y/Z_BLOCK == true/false/false -> Each segment contains thread
// blocks that have the same blockDim.x. There will be blockDim.y*blockDim.z
// such segments.
//
// X/Y/Z_THREAD also works similarly as X/Y/Z_BLOCK and defines a
// group of threads that are reduced togather.
//
// After the function completes, only one thread block per reduction segment
// gets valid reduction results. There is no guarantee which particular block
// gets the final results.
//
// entrance_ind and n_entrances are allowed when PERSISTENT_REDUCTION = false.
// If a grid reduction call is only called once per thread, entrance_ind == 0
// and n_entrances == 1. However, grid reduction can be called in a loop in a
// thread, in that case entrance_ind is the count of times the function has been
// called, and n_entrances is the total number of times it will be called.
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void gridReduce(
T& out,
const T& inp_val,
Func reduction_op,
volatile T* work_buf,
int64_t* sync_flags,
T* shared_buf,
bool read_pred,
bool write_pred,
T init_val,
const nvfuser_index_t entrance_ind,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
T block_reduction_val = init_val;
// Do block reduction when required
if (X_THREAD || Y_THREAD || Z_THREAD) {
blockReduce<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
block_reduction_val,
inp_val,
reduction_op,
shared_buf,
read_pred,
true,
init_val,
block_dim);
} else if (read_pred) {
block_reduction_val = inp_val;
}
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads we can use in final reduction, Seems to assume all
// threads in the block participate
const auto block_reduction_segment_size =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
// Number of reductions in the grid
const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
? 1
: index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
work_buf += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
(!Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
auto work_buf_offset =
block_offset * block_reduction_segment_size + thread_offset;
work_buf[work_buf_offset] = block_reduction_val;
}
if (PERSISTENT_REDUCTION) {
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
} else {
// Use a different sync flag for each call
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[entrance_ind * grid_segment_size + idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// Cleanup with block reduction
gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
out,
(T*)work_buf,
grid_reduction_segment_size,
block_reduction_segment_size,
reduction_op,
shared_buf,
write_pred,
init_val,
block_dim);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
}
// This is just a wrapper of the above grid reduction routine to
// measure the elapsed cycles. The measurement must be done just by
// one thread, and in this case it should be done by one of the
// threads in the last thread block.
#ifdef NVFUSER_PROFILE_KERNEL
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void gridReduce(
T& out,
const T& inp_val,
Func reduction_op,
volatile T* work_buf,
int64_t* sync_flags,
T* shared_buf,
bool read_pred,
bool write_pred,
T init_val,
const nvfuser_index_t entrance_ind,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t& cycles,
int64_t& count) {
int64_t start_counter = 0;
if (index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
gridReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
Aligned,
T,
Func>(
out,
inp_val,
reduction_op,
work_buf,
sync_flags,
shared_buf,
read_pred,
write_pred,
init_val,
entrance_ind,
n_entrances,
block_dim);
if (index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
#endif // NVFUSER_PROFILE_KERNEL
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void gridReduce2PartialReduction(
const T& inp_val,
T init_val,
Func reduction_op,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
volatile T* work_buf,
T* shared_buf,
bool read_pred,
nvfuser_index_t grid_reduction_segment_size,
nvfuser_index_t idx_in_grid_segment,
nvfuser_index_t block_reduction_segment_size) {
T block_reduction_val = init_val;
// Do block reduction when required
if (X_THREAD || Y_THREAD || Z_THREAD) {
blockReduce<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
block_reduction_val,
inp_val,
reduction_op,
shared_buf,
read_pred,
true,
init_val,
block_dim);
} else if (read_pred) {
block_reduction_val = inp_val;
}
if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
(!Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
auto work_buf_offset =
block_offset * block_reduction_segment_size + thread_offset;
work_buf[work_buf_offset] = block_reduction_val;
}
}
// 2-way horizontally fused grid reduction
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
typename T1,
typename Func1,
typename T2,
typename Func2,
typename BlockDimT>
__device__ void gridReduceGroup(
T1& out1,
const T1& inp_val1,
T1 init_val1,
Func1 reduction_op1,
volatile T1* work_buf1,
T2& out2,
const T2& inp_val2,
T2 init_val2,
Func2 reduction_op2,
volatile T2* work_buf2,
int64_t* sync_flags,
void* shared_buf,
bool read_pred,
bool write_pred,
const nvfuser_index_t entrance_ind,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads we can use in final reduction, Seems to assume all
// threads in the block participate
const auto block_reduction_segment_size =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
// Number of reductions in the grid
const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
? 1
: index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
work_buf1 += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
work_buf2 += (entrance_ind * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
gridReduce2PartialReduction<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
Aligned>(
inp_val1,
init_val1,
reduction_op1,
block_dim,
work_buf1,
(T1*)shared_buf,
read_pred,
grid_reduction_segment_size,
idx_in_grid_segment,
block_reduction_segment_size);
gridReduce2PartialReduction<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
Aligned>(
inp_val2,
init_val2,
reduction_op2,
block_dim,
work_buf2,
(T2*)shared_buf,
read_pred,
grid_reduction_segment_size,
idx_in_grid_segment,
block_reduction_segment_size);
if (PERSISTENT_REDUCTION) {
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
} else {
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[entrance_ind * grid_segment_size + idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// Cleanup with block reduction
gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
out1,
work_buf1,
grid_reduction_segment_size,
block_reduction_segment_size,
reduction_op1,
(T1*)shared_buf,
write_pred,
init_val1,
block_dim);
gridReduceLastBlock<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
out2,
work_buf2,
grid_reduction_segment_size,
block_reduction_segment_size,
reduction_op2,
(T2*)shared_buf,
write_pred,
init_val2,
block_dim);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
}
#ifdef NVFUSER_PROFILE_KERNEL
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
typename T1,
typename Func1,
typename T2,
typename Func2,
typename BlockDimT>
__device__ void gridReduceGroup(
T1& out1,
const T1& inp_val1,
T1 init_val1,
Func1 reduction_op1,
volatile T1* work_buf1,
T2& out2,
const T2& inp_val2,
T2 init_val2,
Func2 reduction_op2,
volatile T2* work_buf2,
int64_t* sync_flags,
void* shared_buf,
bool read_pred,
bool write_pred,
const nvfuser_index_t entrance_ind,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t& cycles,
int64_t& count) {
int64_t start_counter = 0;
if (index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
gridReduceGroup<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
Aligned,
T1,
Func1,
T2,
Func2>(
out1,
inp_val1,
init_val1,
reduction_op1,
work_buf1,
out2,
inp_val2,
init_val2,
reduction_op2,
work_buf2,
sync_flags,
shared_buf,
read_pred,
write_pred,
entrance_ind,
n_entrances,
block_dim);
if (index_utils::maskedIsLast<true, true, true>(blockIdx, gridDim) &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
#endif // NVFUSER_PROFILE_KERNEL
// This performs a single reduction step, combining a single element "in" with
// a previous value "work". For a serial grid reduction, "work" resides in
// global memory, while "in" and "out" are in registers.
//
// If the write predicate is false, this function returns early (noop). If the
// read predicate is false, "init" is used in place of "in".
//
// If first_step is false, "work" will be read and reduction_op will be called.
// The result will be written back to "work" unless last_step is true.
template <int64_t vec_size, typename T, typename Func>
__device__ void serialReductionStep(
T* out,
T* in,
T init,
volatile T* work,
Func reduction_op,
bool first_step,
bool last_step,
bool read_pred,
bool write_pred) {
if (!write_pred) {
return;
}
if (read_pred) {
loadGeneric<T, vec_size>(out, in);
} else {
#pragma unroll
for (int i = 0; i < vec_size; ++i) {
out[i] = init;
}
}
if (!first_step) {
T work_reg[vec_size];
loadGlobalToLocal<T, vec_size, true, CacheOp::Global>(work_reg, work);
#pragma unroll
for (int i = 0; i < vec_size; ++i) {
reduction_op(out[i], work_reg[i]);
}
}
if (!last_step) {
loadLocalToGlobal<T, vec_size, true>(work, out);
}
}
// check required transactions based on data type and vectorization factor
// ensure each thread in each transaction has no more than 16 bytes which
// is the maximum allowed vectorization width.
template <typename T, int vec_size>
constexpr __device__ int getTransactions() {
constexpr int total_bytes = vec_size * sizeof(T);
return total_bytes <= 16 ? 1 : total_bytes / 16;
}
template <typename T, int vec_size>
constexpr __device__ int getElementsPerTransaction() {
return vec_size * sizeof(T) <= 16 ? vec_size : 16 / sizeof(T);
}
// calculate elements per section
__inline__ __device__ nvfuser_index_t getElementsPerSection(
nvfuser_index_t row_len,
nvfuser_index_t col_len,
nvfuser_index_t elements_per_thread) {
return row_len * col_len * elements_per_thread;
}
// calculate offset within a section
__inline__ __device__ nvfuser_index_t getOffsetWithinSection(
nvfuser_index_t row_len,
nvfuser_index_t row_id,
nvfuser_index_t col_id,
nvfuser_index_t elements_per_thread) {
return (row_id * row_len + col_id) * elements_per_thread;
}
// vectorized reduction
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
int vec_size,
typename T,
typename Func,
typename BlockDimT>
__device__ void iterGroupedGridReduceLastBlock(
T* out,
const volatile T* in,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t
block_segment_size, // Number of reductions across the block
Func reduction_op,
T* shared_buf,
bool write_pred,
T init_val,
const nvfuser_index_t grid_segment_size,
const nvfuser_index_t idx_in_grid_segment,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, block_dim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
// index into iteration dim.
// Its calculation is same to that in [iterGroupedGridReduce]. Becuase when
// [iterGroupedGridReduceLastBlock] is called from [iterGroupedGridReduce],
// X_THREAD, Y_THREAD, Z_THREAD are flipped.
const auto thread_offset =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, block_dim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
constexpr unsigned int max_align_bytes = 16;
constexpr unsigned int vec_bytes = sizeof(T) * vec_size;
constexpr unsigned int align_bytes =
vec_bytes > max_align_bytes ? max_align_bytes : vec_bytes;
// Ensure alignment for vectorized load/store to smem in grouped block
// reduction
__align__(align_bytes) T inp[vec_size];
#pragma unroll
for (int i = 0; i < vec_size; i++) {
inp[i] = init_val;
}
// Max vectorized load/store size is 16 bytes, if each thread has more than
// 16 bytes, split into multiple sections to ensure each thread occupies only
// 16 bytes at most. For example, if each thread has 8 fp32 which occupies 32
// bytes, split into 2 sections, in each secdtion each thread holds 4 fp32 or
// 16 bytes. Thread-0 processes elements [0,7], the first 4 elements [0,3] are
// stored in the first section and the last 4 elements [4,7] are stored in the
// 2nd section. The data layout in gmem is:
// |-----------section 1-----------|-----------section 2-----------|
// TIDx: |000|001|002|003|004|005|006|007|000|001|002|003|004|005|006|007|
// GMEM: |000|016|032|048|064|080|096|112|128|144|160|176|192|208|224|240|
// Element:|000|008|016|024|032|040|048|056|004|012|020|028|036|044|052|060|
// This layout ensures coalesced access to gmem and each transaction loads 128
// bytes.
constexpr auto n_transactions = getTransactions<T, vec_size>();
constexpr auto n_elements_per_transaction =
getElementsPerTransaction<T, vec_size>();
const auto elements_per_section = getElementsPerSection(
block_segment_size * grid_segment_size, // row len
grid_reduction_segment_size, // col len
n_elements_per_transaction);
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto offset_in_section = getOffsetWithinSection(
block_segment_size * grid_segment_size, // row len
reduction_i, // row id
block_segment_size * idx_in_grid_segment + thread_offset, // col id
n_elements_per_transaction);
#pragma unroll
for (auto i = 0; i < n_transactions; i++) {
auto i_offset = i * n_elements_per_transaction;
T in_reg[n_elements_per_transaction];
loadGlobalToLocal<T, n_elements_per_transaction, true, CacheOp::Global>(
&in_reg[0],
const_cast<T*>(in + elements_per_section * i + offset_in_section));
#pragma unroll
for (auto j = 0; j < n_elements_per_transaction; j++) {
reduction_op(inp[i_offset + j], in_reg[j]);
}
}
}
// Block reduce the per thread values into per "participating" thread values.
// inp_tmp stores output results, not being vectorized loaded to smem, no need
// to enforce alignment.
T inp_tmp[vec_size];
#pragma unroll
for (int i = 0; i < vec_size; i++) {
inp_tmp[i] = init_val;
}
blockIterGroupedYdimReduce<Aligned, vec_size>(
inp_tmp, inp, reduction_op, shared_buf, true, init_val, block_dim);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
#pragma unroll
for (int i = 0; i < vec_size; i++) {
reduction_op(out[i], inp_tmp[i]);
}
}
}
// Main algorithm is same to gridReduce: start with block reduce then write
// results to gmem, the last block load from gmem and finalize with a block
// reduction. Main differences:
// (1) each thread in the iter dim does [vec_size] reductions instead of 1.
// (2) using [blockIterGroupedYdimReduce] instead of [blockReduce].
// (3) ensures vectorized load/store to gmem.
// Specifically, the new para [vec_size] is the vecotrization factor in the
// iteration dimension. It is used in outer reduction to reduce calling this
// grid reduction from [vec_size] times to only 1 time. Its value is limited
// to 1, 2, 4, 8, 16 based on the hardware support and input data type.
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
int vec_size,
typename T,
typename Func,
typename BlockDimT>
__device__ void iterGroupedGridReduce(
T* out,
const T* inp_val,
Func reduction_op,
volatile T* work_buf,
int64_t* sync_flags,
T* shared_buf,
bool read_pred,
bool write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// inp or block reduction results
T block_reduction_val[vec_size];
// Do block reduction when required
if (X_THREAD || Y_THREAD || Z_THREAD) {
#pragma unroll
for (int i = 0; i < vec_size; i++) {
block_reduction_val[i] = init_val;
}
blockIterGroupedYdimReduce<Aligned, vec_size>(
block_reduction_val,
inp_val,
reduction_op,
shared_buf,
read_pred,
true,
init_val,
block_dim);
} else if (read_pred) {
#pragma unroll
for (int i = 0; i < vec_size; i++) {
block_reduction_val[i] = inp_val[i];
}
}
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of reductions in each block
const auto block_segment_size =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
// Number of reductions in the grid
const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
? 1
: index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
if ((!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
(!Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
// Max vectorized load/store size is 16 bytes, if each thread has more than
// 16 bytes, split into multiple sections to ensure each thread occupies
// only 16 bytes at most. For example, if each thread has 8 fp32 which
// occupies 32 bytes, split into 2 sections, in each secdtion each thread
// holds 4 fp32 or 16 bytes. Thread-0 processes elements [0,7], the first 4
// elements [0,3] are stored in the first section and the last 4 elements
// [4,7] are stored in the 2nd section. The data layout in gmem is:
// |-----------section 1-----------|-----------section 2-----------|
// TIDx: |000|001|002|003|004|005|006|007|000|001|002|003|004|005|006|007|
// GMEM: |000|016|032|048|064|080|096|112|128|144|160|176|192|208|224|240|
// Element:|000|008|016|024|032|040|048|056|004|012|020|028|036|044|052|060|
// This layout ensures coalesced access to gmem and each transaction loads
// 128 bytes.
constexpr auto n_transactions = getTransactions<T, vec_size>();
constexpr auto n_elements_per_transaction =
getElementsPerTransaction<T, vec_size>();
// get elements per section, used to offset between different sections
// number of elements in each thread: [n_elements_per_transaction]
// number of threads in each row: [block_segment_size] * [grid_segment_size]
// number of rows in each section: [grid_reduction_segment_size]
auto elements_per_section = getElementsPerSection(
block_segment_size * grid_segment_size, // row len
grid_reduction_segment_size, // col len
n_elements_per_transaction);
// index to the right position in [work_buf] to store block reduction
// results. Consider a typical outer reduction case where iteration dim is
// TIDx and BIDx and reduction dim is TIDy and BIDy. block_offset = BIDy
// block_segment_size = blockDim.x
// grid_segment_size = gridDim.x
// idx_in_grid_segment = BIDx
// thread_offset = TIDx
auto offset_in_section = getOffsetWithinSection(
block_segment_size * grid_segment_size, // row len
block_offset, // row id
block_segment_size * idx_in_grid_segment + thread_offset, // col id
n_elements_per_transaction);
#pragma unroll
for (int i = 0; i < n_transactions; i++) {
loadLocalToGlobal<T, n_elements_per_transaction, true>(
&work_buf[elements_per_section * i + offset_in_section],
&block_reduction_val[i * n_elements_per_transaction]);
}
}
if (PERSISTENT_REDUCTION) {
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
} else {
// there is only one vectorized call
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// Cleanup with block reduction
iterGroupedGridReduceLastBlock<
!X_THREAD,
!Y_THREAD,
!Z_THREAD,
Aligned,
vec_size>(
out,
(T*)work_buf,
grid_reduction_segment_size,
block_segment_size,
reduction_op,
shared_buf,
write_pred,
init_val,
grid_segment_size,
idx_in_grid_segment,
block_dim);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
}
} // namespace reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace grid_broadcast {
// Broadcasts per-thread values across threads and blocks.
//
// Function parameters:
// - out: Per-thread output location
// - inp_val: Per-thread input value
// - work_buf: Temporary buffer for communication across threads/blocks
// - sync_flags: A vector of integers for synchronizations
//
// Template parameters:
// - X/Y/Z_BLOCK: When true, broadcasts across thread blocks along the X/Y/Z
// dimensions
// - X/Y/Z_THREAD: When true, broadcasts across threads along the X/Y/Z
// dimensions
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
typename T,
typename BlockDimT>
__device__ void broadcast(
T& out,
const T& inp_val,
volatile T* work_buf,
Tensor<int64_t, 1> sync_flags,
bool read_write_pred,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// Number of values broadcasted in the grid dimensions
const auto grid_seg_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the broadcast we're performing out of the grid_seg_size
const auto grid_seg_idx =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads not participating in a broadcast dimension, this is the
// number of thread entries to expect in the work buffer, therefore a striding
const auto block_stride =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
// Which broadcast in the block this is to line up the entry with the work
// buffer
const auto thread_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
const bool has_valid_data = (!X_BLOCK || blockIdx.x == gridDim.x - 1) &&
(!Y_BLOCK || blockIdx.y == gridDim.y - 1) &&
(!Z_BLOCK || blockIdx.z == gridDim.z - 1) &&
(!X_THREAD || threadIdx.x == 0) && (!Y_THREAD || threadIdx.y == 0) &&
(!Z_THREAD || threadIdx.z == 0);
if (has_valid_data && read_write_pred) {
work_buf[grid_seg_idx * block_stride + thread_offset] = inp_val;
__threadfence();
}
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true, Aligned>(
sync_flags[grid_seg_idx], grid_seg_size, block_dim);
if (read_write_pred) {
out = work_buf[grid_seg_idx * block_stride + thread_offset];
}
// Make sure everyone has read from the buffer before continuing the kernel
// and potentially overwriting
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, true, Aligned>(
sync_flags[grid_seg_idx], grid_seg_size, block_dim);
}
} // namespace grid_broadcast
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace broadcast {
// Broadcasts within partitioned groups of threads.
//
// X_THREAD: Broadcast from threadIdx.x == 0 if true
// Y_THREAD: Broadcast from threadIdx.y == 0 if true
// Z_THREAD: Broadcast from threadIdx.z == 0 if true
// Aligned: Called from aligned threads if true
// inp_val: Per-thread source value. Only valid when the thread is a source.
// out: Per-thread output location
//
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
typename T,
typename BlockDimT>
__device__ void blockBroadcast(
T& out,
const T& inp_val,
T* shared_mem,
bool read_write_pred,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
const bool has_valid_data = (!X_THREAD || threadIdx.x == 0) &&
(!Y_THREAD || threadIdx.y == 0) && (!Z_THREAD || threadIdx.z == 0);
const auto shared_offset =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
if (has_valid_data && read_write_pred) {
shared_mem[shared_offset] = inp_val;
}
block_sync::sync<Aligned>(block_dim);
if (read_write_pred) {
out = shared_mem[shared_offset];
}
block_sync::sync<Aligned>(block_dim);
}
} // namespace broadcast
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
template <typename DataType>
struct WelfordTriplet {
DataType avg;
DataType var;
nvfuser_index_t N;
};
template <typename DataType>
__inline__ __device__ void copyTriplet(
DataType* dst_avg,
DataType* dst_var,
nvfuser_index_t* dst_N,
const WelfordTriplet<DataType>& src) {
*dst_avg = src.avg;
*dst_var = src.var;
*dst_N = src.N;
}
template <typename DataType>
__inline__ __device__ void copyTriplet(
WelfordTriplet<DataType>& dst,
const DataType* src_avg,
const DataType* src_var,
const nvfuser_index_t* src_N) {
dst.avg = *src_avg;
dst.var = *src_var;
dst.N = *src_N;
}
template <typename DataType>
__inline__ __device__ void copyTriplet(
WelfordTriplet<DataType>& dst,
const WelfordTriplet<DataType>& src) {
dst.avg = src.avg;
dst.var = src.var;
dst.N = src.N;
}
// -----------------------------------------------------------------------------------------------
// Block Welford Primitives
// -----------------------------------------------------------------------------------------------
// Basic utility for welford update. Can be used to scan one value, or two merge
// two welford results
template <typename T, typename TN>
__inline__ __device__ void welfordCombine(
T& a_avg,
T& a_M2,
TN& a_N,
const T b_avg,
const T b_M2,
TN b_N) {
if (b_N == 0) {
return;
}
TN ab_N = a_N + b_N;
T b_N_div_ab_N = ((T)(nvfuser_index_t)(b_N)) / ((T)(nvfuser_index_t)(ab_N));
T delta = b_avg - a_avg;
a_avg += delta * b_N_div_ab_N;
a_M2 += b_M2 + delta * delta * ((T)(nvfuser_index_t)(a_N)) * b_N_div_ab_N;
a_N = ab_N;
}
template <typename T, bool OutputGmem>
__inline__ __device__ void welfordVectorized(
T& a_avg,
T& a_M2,
nvfuser_index_t& a_N,
const T b_avg,
const T b_N_div_ab_N,
const nvfuser_index_t ab_N,
const bool pred) {
// Want only predicated statements and don't want to have
// "if", but for gmem output writes can be illegal, so needs to
// bail out here.
if (OutputGmem && !pred) {
return;
}
T predicated_b_avg = pred ? b_avg : a_avg;
T delta0 = predicated_b_avg - a_avg;
a_avg += delta0 * b_N_div_ab_N;
T delta1 = predicated_b_avg - a_avg;
a_M2 += delta0 * delta1;
a_N = ab_N;
}
// Non predicated version
template <typename T>
__inline__ __device__ void welfordVectorized(
T& a_avg,
T& a_M2,
nvfuser_index_t& a_N,
const T b_avg,
const T b_N_div_ab_N,
const nvfuser_index_t ab_N) {
T delta0 = b_avg - a_avg;
a_avg += delta0 * b_N_div_ab_N;
T delta1 = b_avg - a_avg;
a_M2 += delta0 * delta1;
a_N = ab_N;
}
// [Z,Y,X]_THREADS is the number of participating threads in the z, y, x
// dimension of the block.
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
bool Aligned,
typename T,
typename TN,
typename BlockDimT>
__inline__ __device__ void blockWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& in_avg,
const T& in_M2,
const TN& in_N,
T* shared_mem_avg,
T* shared_mem_M2,
TN* shared_mem_N,
bool read_pred,
bool write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// If this thread will output a final result
bool should_write =
index_utils::maskedIsZero<X_REDUCE, Y_REDUCE, Z_REDUCE>(threadIdx);
// Size of the reduction segments
unsigned int reduction_size =
index_utils::maskedSize<X_REDUCE, Y_REDUCE, Z_REDUCE>(block_dim);
// Index into the reduction segment
unsigned int reduction_tid =
index_utils::maskedOffset<X_REDUCE, Y_REDUCE, Z_REDUCE>(
threadIdx, block_dim);
// Index of the reduction segment
unsigned int reduction_idx =
index_utils::maskedOffset<!X_REDUCE, !Y_REDUCE, !Z_REDUCE>(
threadIdx, block_dim);
// Offset into smem for the current thread
unsigned int smem_offset = reduction_idx * reduction_size + reduction_tid;
if (read_pred) {
shared_mem_avg[smem_offset] = in_avg;
shared_mem_M2[smem_offset] = in_M2;
shared_mem_N[smem_offset] = in_N;
} else {
shared_mem_avg[smem_offset] = init_val;
shared_mem_M2[smem_offset] = init_val;
shared_mem_N[smem_offset] = 0;
}
block_sync::sync<Aligned>(block_dim);
// Reduce down to nearest power of 2:
int np2 = 1 << (31 - __clz(reduction_size));
if (reduction_tid < np2 && reduction_tid + np2 < reduction_size) {
welfordCombine(
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset],
shared_mem_avg[smem_offset + np2],
shared_mem_M2[smem_offset + np2],
shared_mem_N[smem_offset + np2]);
}
block_sync::sync<Aligned>(block_dim);
// loop peel the final iteration to save one syncthread for the end
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (reduction_tid < factor) {
welfordCombine(
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset],
shared_mem_avg[smem_offset + factor],
shared_mem_M2[smem_offset + factor],
shared_mem_N[smem_offset + factor]);
}
block_sync::sync<Aligned>(block_dim);
}
if (should_write && write_pred) {
T res_avg = out_avg;
T res_M2 = out_M2;
TN res_N = out_N;
welfordCombine(
res_avg,
res_M2,
res_N,
shared_mem_avg[smem_offset],
shared_mem_M2[smem_offset],
shared_mem_N[smem_offset]);
if (reduction_size > 1) {
welfordCombine(
res_avg,
res_M2,
res_N,
shared_mem_avg[smem_offset + 1],
shared_mem_M2[smem_offset + 1],
shared_mem_N[smem_offset + 1]);
}
out_avg = res_avg;
out_M2 = res_M2;
out_N = res_N;
}
block_sync::sync<Aligned>(block_dim);
}
// Use the same pred for both reads and writes
template <
bool X_REDUCE,
bool Y_REDUCE,
bool Z_REDUCE,
bool Aligned,
typename T,
typename TN,
typename BlockDimT>
__inline__ __device__ void blockWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& in_avg,
const T& in_M2,
const TN& in_N,
T* shared_mem_avg,
T* shared_mem_M2,
TN* shared_mem_N,
bool read_write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
blockWelford<X_REDUCE, Y_REDUCE, Z_REDUCE, Aligned, T, TN>(
out_avg,
out_M2,
out_N,
in_avg,
in_M2,
in_N,
shared_mem_avg,
shared_mem_M2,
shared_mem_N,
read_write_pred,
read_write_pred,
init_val,
block_dim);
}
// -----------------------------------------------------------------------------------------------
// Grid Welford Prototype
// -----------------------------------------------------------------------------------------------
namespace welford {
template <
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool Aligned,
typename T,
typename TN,
typename BlockDimT>
__device__ void gridWelfordLastBlock(
T& out_avg,
T& out_M2,
TN& out_N,
const volatile T* in_avg,
const volatile T* in_M2,
const volatile TN* in_N,
const nvfuser_index_t
grid_reduction_segment_size, // Number of reductions across
// grid reduce dimensions
const nvfuser_index_t block_reduction_segment_size,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
T* shared_buf_avg,
T* shared_buf_M2,
TN* shared_buf_N,
bool write_pred,
T init_val) {
// We have to do num_reductions across reduction_size. The reductions are
// contiguous, but offset by reduction_size. There is an entry in "in" for
// every block, and every thread marked as true. Threads in dimensions marked
// as false can be used to parallelize the reduction.
// Find the reduction id of the participating threads
const auto block_reduction_segment_idx =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, block_dim);
// Find an id associated within a reduction segment for all
// "non-participating" threads, which will parallelize the reductions for the
// "participating" threads
const auto id_in_block_segment =
index_utils::maskedOffset<!X_THREAD, !Y_THREAD, !Z_THREAD>(
threadIdx, block_dim);
// Stride by the "non-participating" threads
const auto input_stride_for_thread_in_segment =
index_utils::maskedSize<!X_THREAD, !Y_THREAD, !Z_THREAD>(block_dim);
T inp_avg = init_val;
T inp_M2 = init_val;
TN inp_N = 0;
// Block stride across the reduction until we only have one value per thread
for (nvfuser_index_t reduction_i = id_in_block_segment;
reduction_i < grid_reduction_segment_size;
reduction_i += input_stride_for_thread_in_segment) {
auto work_buf_offset = reduction_i * block_reduction_segment_size +
block_reduction_segment_idx;
welfordCombine(
inp_avg,
inp_M2,
inp_N,
in_avg[work_buf_offset],
in_M2[work_buf_offset],
in_N[work_buf_offset]);
}
// Block reduce the per thread values into per "participating" thread values
T inp_avg_tmp = init_val;
T inp_M2_tmp = init_val;
TN inp_N_tmp = 0;
blockWelford<!X_THREAD, !Y_THREAD, !Z_THREAD, Aligned>(
inp_avg_tmp,
inp_M2_tmp,
inp_N_tmp,
inp_avg,
inp_M2,
inp_N,
shared_buf_avg,
shared_buf_M2,
shared_buf_N,
true,
init_val,
block_dim);
const bool should_write = (X_THREAD || threadIdx.x == 0) &&
(Y_THREAD || threadIdx.y == 0) && (Z_THREAD || threadIdx.z == 0);
if (should_write && write_pred) {
welfordCombine(out_avg, out_M2, out_N, inp_avg_tmp, inp_M2_tmp, inp_N_tmp);
}
}
// Grid welford combine. See GridReduction for more information
template <
bool X_BLOCK,
bool Y_BLOCK,
bool Z_BLOCK,
bool X_THREAD,
bool Y_THREAD,
bool Z_THREAD,
bool PERSISTENT_REDUCTION,
bool Aligned,
typename T,
typename TN,
typename BlockDimT>
__device__ void gridWelford(
T& out_avg,
T& out_M2,
TN& out_N,
const T& inp_avg,
const T& inp_M2,
const TN& inp_N,
volatile T* work_buf_avg,
volatile T* work_buf_M2,
volatile TN* work_buf_N,
Tensor<int64_t, 1> sync_flags,
T* shared_buf_avg,
T* shared_buf_M2,
TN* shared_buf_N,
bool read_pred,
bool write_pred,
T init_val,
const nvfuser_index_t entrance_ind,
const nvfuser_index_t n_entrances,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// entrance index only matters for non-persistent re-entrant grid reductions.
const nvfuser_index_t entrance_ind_ = PERSISTENT_REDUCTION ? 0 : entrance_ind;
const nvfuser_index_t n_entrances_ = PERSISTENT_REDUCTION ? 1 : n_entrances;
// Number of values to reduce in the reduction segment
const auto grid_reduction_segment_size =
index_utils::maskedSize<X_BLOCK, Y_BLOCK, Z_BLOCK>(gridDim);
// Index of the reduction we're performing out of the
// grid_reduction_segment_size
const auto idx_in_grid_segment =
index_utils::maskedOffset<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(
blockIdx, gridDim);
// Number of threads we can use in final reduction, Seems to assume all
// threads in the block participate
const auto block_reduction_segment_size =
index_utils::maskedSize<X_THREAD, Y_THREAD, Z_THREAD>(block_dim);
// Number of reductions in the grid
const nvfuser_index_t grid_segment_size = PERSISTENT_REDUCTION
? 1
: index_utils::maskedSize<!X_BLOCK, !Y_BLOCK, !Z_BLOCK>(gridDim);
// advance to the offset for this segment
// index of reduction * size of the reduction * size of threads
work_buf_avg += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
work_buf_M2 += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
work_buf_N += (entrance_ind_ * grid_segment_size + idx_in_grid_segment) *
grid_reduction_segment_size * block_reduction_segment_size;
if ((X_THREAD || threadIdx.x == 0) && (Y_THREAD || threadIdx.y == 0) &&
(Z_THREAD || threadIdx.z == 0)) {
auto block_offset =
index_utils::maskedOffset<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
auto thread_offset =
index_utils::maskedOffset<X_THREAD, Y_THREAD, Z_THREAD>(
threadIdx, block_dim);
auto work_buf_offset =
block_offset * block_reduction_segment_size + thread_offset;
if (read_pred) {
work_buf_avg[work_buf_offset] = inp_avg;
work_buf_M2[work_buf_offset] = inp_M2;
work_buf_N[work_buf_offset] = inp_N;
} else {
work_buf_avg[work_buf_offset] = init_val;
work_buf_M2[work_buf_offset] = init_val;
work_buf_N[work_buf_offset] = 0;
}
}
if (PERSISTENT_REDUCTION) {
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
} else {
// Use a different sync flag for each call
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[entrance_ind_ * grid_segment_size + idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
bool last_block =
index_utils::maskedIsLast<X_BLOCK, Y_BLOCK, Z_BLOCK>(blockIdx, gridDim);
if (last_block) {
// final reduction
gridWelfordLastBlock<X_THREAD, Y_THREAD, Z_THREAD, Aligned>(
out_avg,
out_M2,
out_N,
work_buf_avg,
work_buf_M2,
work_buf_N,
grid_reduction_segment_size,
block_reduction_segment_size,
block_dim,
shared_buf_avg,
shared_buf_M2,
shared_buf_N,
write_pred,
init_val);
}
if (PERSISTENT_REDUCTION) {
// Make sure we're done with global memory before we allow the kernel to
// continue
grid_sync::sync<X_BLOCK, Y_BLOCK, Z_BLOCK, PERSISTENT_REDUCTION, Aligned>(
sync_flags[idx_in_grid_segment],
grid_reduction_segment_size,
block_dim);
}
}
} // namespace welford
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace warp {
template <typename T>
__device__ __forceinline__ T shfl_xor(T var, int laneMask, int width = 32) {
return __shfl_xor_sync(0xffffffff, var, laneMask, width);
}
template <typename T>
__device__ __forceinline__ std::complex<T> shfl_xor(
std::complex<T> var,
int laneMask,
int width = 32) {
T real = __shfl_xor_sync(0xffffffff, var.real(), laneMask, width);
T imag = __shfl_xor_sync(0xffffffff, var.imag(), laneMask, width);
return std::complex<T>(real, imag);
}
template <
bool SINGLE_WARP,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void warpReduceTIDX(
T& out,
const T& inp_val,
Func reduction_op,
T* shared_mem,
bool read_write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
constexpr int WARP_SIZE = 32;
// Assume input padded to multiples of a warp
T reduce_val = init_val;
// Do warp reduction
if (read_write_pred) {
reduce_val = inp_val;
}
// Reduce within each warp
for (int i = 16; i >= 1; i /= 2) {
reduction_op(reduce_val, shfl_xor(reduce_val, i, WARP_SIZE));
}
// Reduce across warp if needed
// Load value to shared mem
if (!SINGLE_WARP) {
unsigned int warp_idx = threadIdx.x / WARP_SIZE;
unsigned int lane_idx = threadIdx.x % WARP_SIZE;
unsigned int reduce_group_id = threadIdx.z * block_dim.y + threadIdx.y;
bool is_warp_head = lane_idx == 0;
unsigned int reduction_size = block_dim.x;
unsigned int num_of_warps = reduction_size / WARP_SIZE;
unsigned int smem_offset = reduce_group_id * num_of_warps;
block_sync::sync<Aligned>(block_dim);
if (is_warp_head) {
shared_mem[smem_offset + warp_idx] = reduce_val;
}
block_sync::sync<Aligned>(block_dim);
if (warp_idx == 0) {
// This assumes num_of_warps will be < 32, meaning < 1024 threads.
// Should be true for long enough.
assert(num_of_warps <= 32);
reduce_val = lane_idx < num_of_warps ? shared_mem[smem_offset + lane_idx]
: init_val;
// Reduce within warp 0
for (int i = 16; i >= 1; i /= 2) {
reduction_op(reduce_val, shfl_xor(reduce_val, i, 32));
}
}
if (is_warp_head) {
reduction_op(out, reduce_val);
}
// needs sync, otherwise other warps may access shared memory before this
// reduction is done.
block_sync::sync<Aligned>(block_dim);
} else {
reduction_op(out, reduce_val);
}
}
template <
int BDIMX,
int BDIMY,
bool Aligned,
typename T,
typename Func,
typename BlockDimT>
__device__ void warpReduceTIDXY(
T& out,
const T& inp_val,
Func reduction_op,
T* shared_mem,
bool read_write_pred,
T init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
constexpr int WARP_SIZE = 32;
constexpr int num_of_warps = BDIMX * BDIMY / WARP_SIZE;
// Assume input padded to multiples of a warp
T reduce_val = init_val;
// Do warp reduction
if (read_write_pred) {
reduce_val = inp_val;
}
// Reduce within each warp
for (int i = 16; i >= 1; i /= 2) {
reduction_op(reduce_val, shfl_xor(reduce_val, i, WARP_SIZE));
}
// Reduce across warp if needed
// Load value to shared mem
if (num_of_warps > 1) {
unsigned int idx = threadIdx.x + threadIdx.y * BDIMX;
unsigned int warp_idx = idx / WARP_SIZE;
unsigned int lane_idx = idx % WARP_SIZE;
block_sync::sync<Aligned>(block_dim);
if (lane_idx == 0) {
shared_mem[warp_idx] = reduce_val;
}
block_sync::sync<Aligned>(block_dim);
if (warp_idx == 0) {
reduce_val = lane_idx < num_of_warps ? shared_mem[lane_idx] : init_val;
// Reduce within warp 0
for (int i = 16; i >= 1; i /= 2) {
reduction_op(reduce_val, shfl_xor(reduce_val, i, 32));
}
}
if (lane_idx == 0) {
reduction_op(out, reduce_val);
}
// needs sync, otherwise other warps may access shared memory before this
// reduction is done.
block_sync::sync<Aligned>(block_dim);
} else {
reduction_op(out, reduce_val);
}
}
} // namespace warp
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
// Utility for converting generic pointer to SMEM pointer in PTX.
// We should review vectorized load/stores with shared memory.
// SMEM memory movement PTX is only Global -> SMEM, SMEM -> Local, Local ->
// SMEM, and this is needed for these PTX instructions to provide the SMEM
// pointer.
__device__ inline unsigned toSmem(const void* raw_ptr) {
unsigned smem_ptr_uint;
asm("{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %1; cvt.u32.u64 %0, smem_ptr; }\n"
: "=r"(smem_ptr_uint)
: "l"(raw_ptr));
return smem_ptr_uint;
}
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750))
namespace Turing {
// LdMatrix has .x1, .x2 and .x4 options, currently we actively use .x2 and
// .x4. In .x2 option. the the address register of upper half warp (lane 16-31)
// are un-used but on Turing [sm75,sm80) architecture these un-used addresses
// need to be valid, in the sense that:
// 1. The data it points to has to be within allocated shared mem buffer.
// 2. The address needs to be aligned to 16 byte.
// See also:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-ldmatrix
// This function addresses 2. above by masking out the sub-16B component
// of the address in upper warp and 1. is guaranteed by ldmatrix swizzle
// util.
// This will **not** affect any functionality. This is just modification
// of unused pointers to satisfy the alignment requirement on Turing
// hardware.
// The alignment requirement is lifted on sm80+,
// so this function is a no-op on Ampere or above.
template <unsigned num_valid_addresses>
__device__ inline unsigned adjustPartialLdMatrixAddrInTuring(
unsigned addr_in_byte) {
const unsigned lane = threadIdx.x % 32;
if (lane >= num_valid_addresses) {
return 0;
}
return addr_in_byte;
}
} // namespace Turing
#endif // Arch 75
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
namespace Hopper {
// Description: Elect a leader thread from a set of threads in a warp
//
// The common pattern is to select any thread from the first warp without
// creating a serialized, peeling loop.
//
// Code example: threadIdx.x / 32 == 0 && ptx::elect_sync(~0)
//
// Compile Explorer Reference: https://ce.nvidia.com/z/d9x4q8
//
// Document Reference:
// https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-elect-sync
__device__ inline bool electSync(const uint32_t& membermask) {
uint32_t is_elected;
asm volatile(
"{\n\t .reg .pred P_OUT; \n\t"
"elect.sync _|P_OUT, %1;\n\t"
"selp.b32 %0, 1, 0, P_OUT; \n"
"}"
: "=r"(is_elected)
: "r"(membermask)
:);
return static_cast<bool>(is_elected);
}
// References:
//
// TMA:
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_tma.hpp
//
// Tensor map:
// https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
// UBLK:
// https://github.com/NVIDIA/cutlass/blob/main/include/cute/arch/copy_sm90_tma.hpp#L1400
// UBLK Load:
struct CpAsyncBulkG2SIndex {
const void* raw_gmem_addr;
uint32_t bytes;
uint32_t mbarrier;
};
__device__ inline void cpAsyncBulkG2S(
const CpAsyncBulkG2SIndex& src,
uint32_t smem_addr) {
asm volatile(
"cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], [%1], %2, [%3];\n"
:
: "r"(smem_addr),
"l"(src.raw_gmem_addr),
"r"(src.bytes),
"r"(src.mbarrier)
: "memory");
}
// UBLK Store:
struct CpAsyncBulkS2GIndex {
const void* raw_gmem_addr;
uint32_t bytes;
};
__device__ inline void cpAsyncBulkS2G(
const CpAsyncBulkS2GIndex& dst,
uint32_t smem_addr) {
asm volatile("cp.async.bulk.global.shared::cta.bulk_group [%0], [%1], %2;\n"
:
: "l"(dst.raw_gmem_addr), "r"(smem_addr), "r"(dst.bytes)
: "memory");
}
// TMA Loads:
template <int dim>
struct CpAsyncBulkTensorTileG2SIndex {
const TensorMap* descriptor;
Array<int32_t, dim> crds;
uint32_t mbarrier;
};
__device__ inline void cpAsyncBulkTensorTileG2S(
const CpAsyncBulkTensorTileG2SIndex<1>& src,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes"
" [%0], [%1, {%3}], [%2];"
:
: "r"(smem_addr), "l"(gmem_int_desc), "r"(src.mbarrier), "r"(src.crds[0])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
const CpAsyncBulkTensorTileG2SIndex<1>& src,
uint32_t smem_addr,
uint16_t cta_mask) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.1d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
" [%0], [%1, {%3}], [%2], %4;"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"h"(cta_mask)
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2S(
const CpAsyncBulkTensorTileG2SIndex<2>& src,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes"
" [%0], [%1, {%3, %4}], [%2];"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
const CpAsyncBulkTensorTileG2SIndex<2>& src,
uint32_t smem_addr,
uint16_t cta_mask) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster"
" [%0], [%1, {%3, %4}], [%2], %5;"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"h"(cta_mask)
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2S(
const CpAsyncBulkTensorTileG2SIndex<3>& src,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes"
" [%0], [%1, {%3, %4, %5}], [%2];"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
const CpAsyncBulkTensorTileG2SIndex<3>& src,
uint32_t smem_addr,
uint16_t cta_mask) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.3d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast_cluster"
" [%0], [%1, {%3, %4, %5}], [%2], %6;"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2]),
"h"(cta_mask)
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2S(
const CpAsyncBulkTensorTileG2SIndex<4>& src,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes"
" [%0], [%1, {%3, %4, %5, %6}], [%2];"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2]),
"r"(src.crds[3])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
const CpAsyncBulkTensorTileG2SIndex<4>& src,
uint32_t smem_addr,
uint16_t cta_mask) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.4d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast_cluster"
" [%0], [%1, {%3, %4, %5, %6}], [%2], %7;"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2]),
"r"(src.crds[3]),
"h"(cta_mask)
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2S(
const CpAsyncBulkTensorTileG2SIndex<5>& src,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes"
" [%0], [%1, {%3, %4, %5, %6, %7}], [%2];"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2]),
"r"(src.crds[3]),
"r"(src.crds[4])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileG2SMulticast(
const CpAsyncBulkTensorTileG2SIndex<5>& src,
uint32_t smem_addr,
uint16_t cta_mask) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(src.descriptor);
asm volatile(
"cp.async.bulk.tensor.5d.shared::cluster.global.mbarrier::complete_tx::bytes.multicast_cluster"
" [%0], [%1, {%3, %4, %5, %6, %7}], [%2], %8;"
:
: "r"(smem_addr),
"l"(gmem_int_desc),
"r"(src.mbarrier),
"r"(src.crds[0]),
"r"(src.crds[1]),
"r"(src.crds[2]),
"r"(src.crds[3]),
"r"(src.crds[4]),
"h"(cta_mask)
: "memory");
}
// TMA Stores:
template <int dim>
struct CpAsyncBulkTensorTileS2GIndex {
const TensorMap* descriptor;
Array<int32_t, dim> crds;
};
__device__ inline void cpAsyncBulkTensorTileS2G(
const CpAsyncBulkTensorTileS2GIndex<1>& dest,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
asm volatile(
"cp.async.bulk.tensor.1d.global.shared::cta.bulk_group [%0, {%2}], [%1];"
:
: "l"(gmem_int_desc), "r"(smem_addr), "r"(dest.crds[0])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileS2G(
const CpAsyncBulkTensorTileS2GIndex<2>& dest,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
asm volatile(
"cp.async.bulk.tensor.2d.global.shared::cta.bulk_group [%0, {%2, %3}], [%1];"
:
: "l"(gmem_int_desc), "r"(smem_addr), "r"(dest.crds[0]), "r"(dest.crds[1])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileS2G(
const CpAsyncBulkTensorTileS2GIndex<3>& dest,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
asm volatile(
"cp.async.bulk.tensor.3d.global.shared::cta.bulk_group [%0, {%2, %3, %4}], [%1];"
:
: "l"(gmem_int_desc),
"r"(smem_addr),
"r"(dest.crds[0]),
"r"(dest.crds[1]),
"r"(dest.crds[2])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileS2G(
const CpAsyncBulkTensorTileS2GIndex<4>& dest,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
asm volatile(
"cp.async.bulk.tensor.4d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5}], [%1];"
:
: "l"(gmem_int_desc),
"r"(smem_addr),
"r"(dest.crds[0]),
"r"(dest.crds[1]),
"r"(dest.crds[2]),
"r"(dest.crds[3])
: "memory");
}
__device__ inline void cpAsyncBulkTensorTileS2G(
const CpAsyncBulkTensorTileS2GIndex<5>& dest,
uint32_t smem_addr) {
uint64_t gmem_int_desc = reinterpret_cast<uint64_t>(dest.descriptor);
asm volatile(
"cp.async.bulk.tensor.5d.global.shared::cta.bulk_group [%0, {%2, %3, %4, %5, %6}], [%1];"
:
: "l"(gmem_int_desc),
"r"(smem_addr),
"r"(dest.crds[0]),
"r"(dest.crds[1]),
"r"(dest.crds[2]),
"r"(dest.crds[3]),
"r"(dest.crds[4])
: "memory");
}
} // namespace Hopper
#endif // Arch 90
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
// Tuple of Welford avg, var and N parameters.
//
// Template parameters:
// - DataTypeT: Type of avg and var
// - IndexTypeT: Type of N
// - MakeTuple: Template template parameter to define Tuple types
// (e.g., MakeLocalTuple>
template <
int NumVals,
typename DataTypeT,
typename IndexTypeT,
template <int, typename> typename MakeTuple>
struct WelfordTripletTuple {
static constexpr int num_vals = NumVals;
using DataType = DataTypeT;
using IndexType = IndexTypeT;
using DataTuple = typename MakeTuple<NumVals, DataType>::type;
using IndexTuple = typename MakeTuple<NumVals, IndexType>::type;
DataTuple avg;
DataTuple var;
IndexTuple N;
WelfordTripletTuple(
const DataTuple& avg,
const DataTuple& var,
const IndexTuple& N)
: avg(avg), var(var), N(N) {}
};
template <int NumVals, typename DataType, typename IndexType>
using LocalWelfordTripletTuple =
WelfordTripletTuple<NumVals, DataType, IndexType, MakeLocalTuple>;
template <int NumVals, typename DataType, typename IndexType>
using RefWelfordTripletTuple =
WelfordTripletTuple<NumVals, DataType, IndexType, MakeRefTuple>;
template <int NumVals, typename DataType, typename IndexType>
using ConstRefWelfordTripletTuple =
WelfordTripletTuple<NumVals, DataType, IndexType, MakeConstRefTuple>;
template <int NumVals, typename DataTypeT, typename IndexTypeT>
using VolatilePtrWelfordTripletTuple =
WelfordTripletTuple<NumVals, DataTypeT, IndexTypeT, MakeVolatilePtrTuple>;
// Advance pointer offsets of WelfordTripleTuple. Only valid when the
// values are pointer values.
template <typename WelfordTripletTupleType>
__inline__ __device__ static void operator+=(
WelfordTripletTupleType& triplet,
nvfuser_index_t offset) {
triplet.avg += offset;
triplet.var += offset;
triplet.N += offset;
}
// Copy each of the triplet tuples
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyWelfordTripletTuple(
DstType& dst,
nvfuser_index_t dst_offset,
const SrcType& src,
nvfuser_index_t src_offset = 0) {
copyTuple(dst.avg, dst_offset, src.avg, src_offset);
copyTuple(dst.var, dst_offset, src.var, src_offset);
copyTuple(dst.N, dst_offset, src.N, src_offset);
}
// Copy each of the triplet tuples
template <typename DstType, typename SrcType>
__inline__ __device__ static void copyWelfordTripletTuple(
DstType& dst,
const SrcType& src,
nvfuser_index_t src_offset = 0) {
copyWelfordTripletTuple(dst, 0, src, src_offset);
}
// Copy each of the triplet tuples
template <typename DstType, typename SrcType, typename PredType>
__inline__ __device__ static void copyWelfordTripletTupleIf(
DstType& dst,
const SrcType& src,
const PredType& pred) {
copyTupleIf(dst.avg, src.avg, pred);
copyTupleIf(dst.var, src.var, pred);
copyTupleIf(dst.N, src.N, pred);
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
//! Suppose f_i be the i-th function of the binary function
//! parameters. Call the function as: f_i(x, y)
template <int i, typename DataType, typename Func, typename... Funcs>
struct FuncSelector {
static __device__ void call(
DataType& x,
const DataType y,
Func f,
Funcs... funcs) {
// Here, i is guaranteed to be larger than 0 as there's a
// specialization for i == 0 below. Recursively call FuncSelector
// by dropping f and decrementing i.
FuncSelector<i - 1, DataType, Funcs...>::call(x, y, funcs...);
}
};
//! Specialization of FuncSelector when i == 0, so f_i is f.
template <typename DataType, typename Func, typename... Funcs>
struct FuncSelector<0, DataType, Func, Funcs...> {
static __device__ void call(
DataType& x,
const DataType y,
Func f,
Funcs... funcs) {
f(x, y);
}
};
//! Call each of the first i+1 functions with the first i+1 values of
//! tuples. Here, i is guaranteed to be larger than -1 as there's a
//! specialization for i == -1.
template <int i, typename TupleType0, typename TupleType1, typename... Funcs>
struct FuncForEach {
static __device__ void call(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Funcs... funcs) {
static_assert(
IsSameType<
typename TupleType0::template ValType<i>,
typename TupleType1::template ValType<i>>::value,
"Invalid tuple types");
// Process the first i functions first.
FuncForEach<i - 1, TupleType0, TupleType1, Funcs...>::call(
val0, offset0, val1, offset1, funcs...);
// Call the i+1-th function
FuncSelector<i, typename TupleType0::template ValType<i>, Funcs...>::call(
val0.val<i>(offset0), val1.val<i>(offset1), funcs...);
}
};
//! Specialization of FuncForEach when i == -1, which means no
//! function to call. Just for stopping the recursive pattern here.
template <typename TupleType0, typename TupleType1, typename... Funcs>
struct FuncForEach<-1, TupleType0, TupleType1, Funcs...> {
static __device__ void call(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Funcs... funcs) {}
};
//! Reduce one value of a tuple using one of the reduction ops. The
//! value at val_idx is reduced by the function at func_idx.
template <
int func_idx,
int val_idx,
typename TupleType0,
typename TupleType1,
typename... Funcs>
__inline__ __device__ static void reduceVal(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Funcs... reduction_ops) {
static_assert(
IsSameType<
typename TupleType0::template ValType<val_idx>,
typename TupleType1::template ValType<val_idx>>::value,
"Invalid tuple types");
FuncSelector<
func_idx,
typename TupleType0::template ValType<val_idx>,
Funcs...>::
call(
val0.val<val_idx>(offset0),
val1.val<val_idx>(offset1),
reduction_ops...);
}
//! Accumulate each value of a given pair of tuples using its corresponding
//! function. Suppose f_i be the i-th reduciton function. Call f_i as:
//! f_i(val0.val<i>(offset0), val1.val<i>(offset1)).
template <typename TupleType0, typename TupleType1, typename... Funcs>
__inline__ __device__ static void reduceEach(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Funcs... reduction_ops) {
constexpr int num_funcs = sizeof...(reduction_ops);
FuncForEach<num_funcs - 1, TupleType0, TupleType1, Funcs...>::call(
val0, offset0, val1, offset1, reduction_ops...);
}
template <typename TupleType0, typename TupleType1, typename Func, int num_vals>
struct TupleReduce {};
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 1> {
__inline__ __device__ static void reduce(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Func reduction_op) {
static_assert(
IsSameType<
typename TupleType0::ValTypes,
typename TupleType1::ValTypes>::value,
"Invalid value types");
reduction_op(val0.val<0>(offset0), val1.val<0>(offset1));
}
};
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 2> {
__inline__ __device__ static void reduce(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Func reduction_op) {
static_assert(
IsSameType<
typename TupleType0::ValTypes,
typename TupleType1::ValTypes>::value,
"Invalid value types");
reduction_op(
val0.val<0>(offset0),
val0.val<1>(offset0),
val1.val<0>(offset1),
val1.val<1>(offset1));
}
};
template <typename TupleType0, typename TupleType1, typename Func>
struct TupleReduce<TupleType0, TupleType1, Func, 3> {
__inline__ __device__ static void reduce(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Func reduction_op) {
static_assert(
IsSameType<
typename TupleType0::ValTypes,
typename TupleType1::ValTypes>::value,
"Invalid value types");
reduction_op(
val0.val<0>(offset0),
val0.val<1>(offset0),
val0.val<2>(offset0),
val1.val<0>(offset1),
val1.val<1>(offset1),
val1.val<2>(offset1));
}
};
//! Reduce all values of a tuple together. The reduction function must
//! have the same number of inputs as the number of values of each tuple.
template <typename TupleType0, typename TupleType1, typename Func>
__inline__ __device__ void reduceTuple(
TupleType0& val0,
nvfuser_index_t offset0,
const TupleType1& val1,
nvfuser_index_t offset1,
Func reduction_op) {
static_assert(
TupleType0::num_vals == TupleType1::num_vals, "Invalid number of values");
TupleReduce<TupleType0, TupleType1, Func, TupleType0::num_vals>::reduce(
val0, offset0, val1, offset1, reduction_op);
}
// Reduces all of the first (idx+1) values by a thread block
template <
int idx,
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
typename LocalTupleT,
typename BlockDimT,
typename... Funcs>
struct BlockReduceEach {
__inline__ __device__ static void reduce(
LocalTupleT& block_result,
const LocalTupleT& partial_result,
void* shared_mem,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Funcs... funcs) {
// Finish the reduction of each tuple value with a smaller offset
BlockReduceEach<
idx - 1,
BROADCAST,
true,
Aligned,
LocalTupleT,
BlockDimT,
Funcs...>::
reduce(
block_result,
partial_result,
shared_mem,
has_block_result,
tid_in_reduction,
num_threads_per_reduction,
num_elements_per_reduction,
reduction_idx,
block_dim,
funcs...);
if (num_elements_per_reduction == 1) {
if (has_block_result) {
block_result.val<idx>(0) = partial_result.val<idx>(0);
}
return;
}
using DataType = typename LocalTupleT::template ValType<idx>;
PtrTuple<DataType> shared_buf(static_cast<DataType*>(shared_mem));
LocalTuple<DataType> block_result_i(partial_result.val<idx>(0));
const auto smem_offset =
reduction_idx * num_threads_per_reduction + tid_in_reduction;
const int np2 = 1 << (31 - __clz(num_elements_per_reduction));
// Threads values are initialized, so all can participate here
if (tid_in_reduction >= np2) {
copyTuple(shared_buf, smem_offset, block_result_i);
}
block_sync::sync<Aligned>(block_dim);
if (tid_in_reduction < np2 &&
tid_in_reduction + np2 < num_elements_per_reduction) {
impl::reduceVal<idx, 0>(
block_result_i, 0, shared_buf, smem_offset + np2, funcs...);
}
if (tid_in_reduction < np2) {
copyTuple(shared_buf, smem_offset, block_result_i);
}
// Always sync when communicating across smem
block_sync::sync<Aligned>(block_dim);
// Reduce down to 2 values, last thread will do the final reduction and
// can save a syncthreads this way
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_reduction < factor) {
impl::reduceVal<idx, 0>(
shared_buf,
smem_offset,
shared_buf,
smem_offset + factor,
funcs...);
}
block_sync::sync<Aligned>(block_dim);
}
copyTuple(block_result_i, shared_buf, smem_offset);
// Do the last reduction
if (has_block_result) {
impl::reduceVal<idx, 0>(
block_result_i, 0, shared_buf, smem_offset + 1, funcs...);
}
if (BROADCAST) {
if (has_block_result) {
// Put result back in shared memory, put in the first entry of the
// reduction segment's buffer
copyTuple(
shared_buf,
reduction_idx * num_threads_per_reduction,
block_result_i);
}
// Sync threads to make sure result is in smem
block_sync::sync<Aligned>(block_dim);
copyTuple(
block_result_i,
shared_buf,
reduction_idx * num_threads_per_reduction);
}
block_result.val<idx>(0) = block_result_i.val<0>(0);
if (FORWARD_PROTECT_SMEM) {
block_sync::sync<Aligned>(block_dim);
}
}
};
// Specialization for idx == -1, i.e., no value to reduce.
template <
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
typename LocalTupleT,
typename BlockDimT,
typename... Funcs>
struct BlockReduceEach<
-1,
BROADCAST,
FORWARD_PROTECT_SMEM,
Aligned,
LocalTupleT,
BlockDimT,
Funcs...> {
__inline__ __device__ static void reduce(
LocalTupleT& block_result,
const LocalTupleT& partial_result,
void* shared_mem,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Funcs... funcs) {}
};
//! Reduce each value of a tuple by a thread block.
//!
//! The final result is broadcast when BROADCAST is true.
//!
//! \param block_result result of the block reduction
//! \param partial_result Per-thread input tuple
//! \param shared_mem
//! \param has_block_result
//! \param tid_in_reduction
//! \param num_threads_per_reduction
//! \param num_elements_per_reduction
//! \param reduction_idx
//! \param reduction_ops
template <
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
typename LocalTupleT,
typename BlockDimT,
typename... Funcs>
__inline__ __device__ void blockReduceEach(
LocalTupleT& block_result,
const LocalTupleT& partial_result,
void* shared_mem,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Funcs... reduction_ops) {
BlockReduceEach<
LocalTupleT::num_vals - 1,
BROADCAST,
FORWARD_PROTECT_SMEM,
Aligned,
LocalTupleT,
BlockDimT,
Funcs...>::
reduce(
block_result,
partial_result,
shared_mem,
has_block_result,
tid_in_reduction,
num_threads_per_reduction,
num_elements_per_reduction,
reduction_idx,
block_dim,
reduction_ops...);
}
} // namespace impl
// We have 6 dimensions, 3 in the grid, 3 in the block
// They can be 1 of 3 states,
// Reduction Domain - TEMPLATE STATE 0
// - Participating in the reduction, has values coming in, one value coming
// out across the dimension
// Iteration Domain - TEMPLATE STATE 1
// - Not participating in the reduction, has values across the dimension after
// the reduction
// Collapsed Domain - TEMPLATE STATE 2
// - Previously reduced, doesn't need to be reduced on that dimension, doesn't
// have values across that dimension
constexpr __device__ bool isReduce(int STATE) {
return STATE == 0;
}
constexpr __device__ bool isIter(int STATE) {
return STATE == 1;
}
constexpr __device__ bool isPred(int STATE) {
return STATE == 2;
}
constexpr __device__ bool inactive(int STATE) {
return STATE == 3;
}
constexpr __device__ bool activeNotIter(int STATE) {
return STATE != 3 && STATE != 1;
}
constexpr __device__ bool isReduceOrIter(int STATE) {
return isReduce(STATE) || isIter(STATE);
}
// When generating an index into the reduction, we have to stride by iteration
// domains and reduction domains. Collapsed domains we can ignore, but we need
// to make sure they never read or write (need to be predicated to correct
// participation).
// All inclusive reduction with option to re-broadcast. This reduction class
// does not use predication of parallelization in the read or write predicates.
// Instead there are 3 states each dimension of parallelization can have,
// described above. Predication, indexing, and reduction will be done based on
// this information.
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
class ParallelReduce {
static_assert(
!BROADCAST || PERSISTENT_REDUCTION,
"Broadcast requires persistent reduction");
static constexpr bool BLOCK_REDUCE =
isReduce(X_THREAD) || isReduce(Y_THREAD) || isReduce(Z_THREAD);
static constexpr bool GRID_REDUCE =
isReduce(X_BLOCK) || isReduce(Y_BLOCK) || isReduce(Z_BLOCK);
// ping-pong between global buffers to avoid a second sync
bool flip = false;
public:
__device__ ParallelReduce() {}
// reduceGroup does not support Welford-style reductions that reduce
// all values of a tuple together, so this is the only entry point
// for Welford for now.
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op);
//! Profiled version
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op,
int64_t& cycles,
int64_t& count);
//! Each value of a tuple is independently reduced by the
//! corresponding reduction op. Thus, Welford-like reductions are
//! not supported by this interface.
//!
//! Note that out is purely used as the output parameter, and its
//! initial value is not used but just overwritten. Since grid
//! reductions do not allow serial reduction IterDomains, there is
//! no need to accumulate into the out parameter.
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
Funcs... funcs);
//! Profiled version
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
int64_t& cycles,
int64_t& count,
Funcs... funcs);
// User-visible entry point of grouped grid welford +
// broadcast. Mostly the same as reduceGroup, and it would be
// possible to combine this to reduceGroup, but it might make the
// templated data structures even more complicated and difficult to
// understand. For now, keep it as a separate function.
//
// Unlike reduceGroup, though, the data types of welford ops must be
// the same. For example, reduceGroup can be used to reduce half and
// float values by passing a tuple of, e.g., LocalTuple<half,
// float>, but that's not supported here for implementation
// simplicity. In practice, it should be really uncommon to group
// welford ops with different data types, so this restriction
// shouldn't be an issue.
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds);
//! Profiled version
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds,
int64_t& cycles,
int64_t& count);
// This is highly specific to the outer-reduction pattern. All the
// assumptions should be asserted with static_assert at the begging of
// the fuction.
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer);
// Profiled version
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer,
int64_t& cycles,
int64_t& count);
private:
__device__ static bool isLastBlockInGrid() {
return index_utils::maskedIsLast<
isReduceOrIter(X_BLOCK),
isReduceOrIter(Y_BLOCK),
isReduceOrIter(Z_BLOCK)>(blockIdx, gridDim) &&
index_utils::maskedIsZero<
!isReduceOrIter(X_BLOCK),
!isReduceOrIter(Y_BLOCK),
!isReduceOrIter(Z_BLOCK)>(blockIdx);
}
//! Initial per-CTA reduction of each value of a tuple. Each value
//! is reduced individually, so the shared memory buffer just needs
//! to be large enough for each value. NOTE that the smem buffer is
//! not forward protected.
template <
bool BLOCK_BROADCAST,
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ static LocalTuple<DataTypes...> reduceGroupBlock(
const ConstRefTuple<DataTypes...>& inp,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
bool block_reduce_participate,
Funcs... funcs);
//! Final reduction of partial results. Done by all blocks
//! redundantly when BROADCAST is true, or just one block otherwise.
//! The smem buffer is assumed synchronized when it is passed in,
//! but it isn't synchronized when returning from this function.
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ static void reduceGroupLastBlock(
RefTuple<DataTypes...>& out,
const VolatilePtrTuple<DataTypes...>& global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const LocalTuple<BoolTypes...>& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate,
Funcs... reduction_ops);
//! Welford version of reduceGroupBlock
template <
bool BLOCK_BROADCAST,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ static void welfordGroupBlock(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const ConstRefWelfordTripletTuple<NumVals, DataType, IndexType>& inp,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumVals, bool>::type& read_preds,
bool block_reduce_participate);
//! Welford version of reduceGrouplLastBlock
template <
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ static void welfordGroupLastBlock(
RefWelfordTripletTuple<NumVals, DataType, IndexType>& out,
const VolatilePtrWelfordTripletTuple<NumVals, DataType, IndexType>&
global_work_buffer,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const typename MakeLocalTuple<NumVals, bool>::type& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate);
// End Parallel reduce class
};
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op) {
// If no reduction needed, just return input
if (!BLOCK_REDUCE && !GRID_REDUCE) {
if (read_pred && write_pred) {
out = inp;
}
return;
}
// Don't read/write in temporary buffers if in a predicated dimension
bool block_reduce_participate = index_utils::
maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
threadIdx);
// Initialize block result
LocalTuple<Types...> block_result = init_val;
// Grab input data if participating in the reduction, set to block_result in
// the case there is no block reduction
if (block_reduce_participate && read_pred) {
block_result = inp;
}
// Only threads that with id == 0 in the dimensions being reduced will
// have a valid result
bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
if (BLOCK_REDUCE) {
// -- START BLOCK REDUCTION -- //
// Size of the block reduction segment, can be an int since it's limited
// to number of threads
int block_reduction_size = index_utils::
maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
block_dim);
// Index in the reduction segment, can be an int since it's limited to
// number of threads
int tid_in_block_reduction = index_utils::maskedOffset<
isReduce(X_THREAD),
isReduce(Y_THREAD),
isReduce(Z_THREAD)>(threadIdx, block_dim);
// ID of the block reduction this thread is participating in
//
// If any of the parallel dimensions are predicated out, that means
// they've already been reduced, so we only care about the first thread in
// that dimension. Therefore don't expand the reduction_idx by that
// dimension
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Shared memory buffer is 2D
// [iter dimension, reduction dimension]
// Offset into smem for the current thread
int block_reduce_smem_offset =
block_reduction_idx * block_reduction_size + tid_in_block_reduction;
// Initialize shared memory
if (block_reduce_participate) {
copyTuple(shared_buf, block_reduce_smem_offset, block_result);
}
// Sync to make sure smem is completely initialized
block_sync::sync<Aligned>(block_dim);
// Round reduction size down to nearest power of 2
int np2 = 1 << (31 - __clz(block_reduction_size));
// Perform an initial reduction leaving np2 elements
if (block_reduce_participate && tid_in_block_reduction < np2 &&
tid_in_block_reduction + np2 < block_reduction_size) {
impl::reduceTuple(
shared_buf,
block_reduce_smem_offset,
shared_buf,
block_reduce_smem_offset + np2,
reduction_op);
}
// Always need to sync while operating on shared memory
block_sync::sync<Aligned>(block_dim);
// Reduce down until 2 values, leaving 2 values allows us to manually
// perform the last reduction and avoid a syncthreads
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_block_reduction < factor && block_reduce_participate) {
impl::reduceTuple(
shared_buf,
block_reduce_smem_offset,
shared_buf,
block_reduce_smem_offset + factor,
reduction_op);
}
block_sync::sync<Aligned>(block_dim);
}
// Accumulate that last valid result
if (has_block_result) {
copyTuple(block_result, shared_buf, block_reduce_smem_offset);
if (block_reduction_size > 1) {
impl::reduceTuple(
block_result,
0,
shared_buf,
block_reduce_smem_offset + 1,
reduction_op);
}
}
// ===== BLOCK REDUCTION CLEANUP =======
if (!GRID_REDUCE) {
// If no grid reduction, we don't have to continue. Either broadcast
// back across the block or return the correct reduction
if (has_block_result && write_pred) {
impl::reduceTuple(block_result, 0, out, 0, reduction_op);
out = block_result;
}
if (BROADCAST) {
// No grid reduce, but need to broadcast, perform block broadcast
if (has_block_result && write_pred) {
// Put result back in shared memory, put in the first entry of the
// reduction segment's buffer
copyTuple(
shared_buf,
block_reduction_idx * block_reduction_size,
block_result);
}
// Sync threads to make sure result is in smem
block_sync::sync<Aligned>(block_dim);
// If the thread is participating, and is not attempting to write out
// of bounds, return the broadcasted value.
if (block_reduce_participate && write_pred) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size);
}
}
// Forward protect shared memory, don't want threads to continue to
// another reduction/broadcast and pollute shared memory before the
// reduction is completely finished.
//
// This could be avoided in some cases if we added thread syncs from
// block reductions in the syncthread insertion pass.
block_sync::sync<Aligned>(block_dim);
return;
}
}
// -- START GRID REDUCTION -- //
// Grid reductions are more challenging for two reasons, (1) the reduction
// itself is 3D instead of 2D because we now have an iter domain space in
// the grid dimension. (2) a tree reduction isn't performed, instead all
// blocks will populate GMEM and one block will finish the grid reduction.
// What is the grid reduction size, block reduction already performed so
// that doesn't have to be taken into consideration
const auto grid_red_size = index_utils::
maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
gridDim);
// Which ID in the reduction is this block. Threads can participate in
// multiple grid reductions, but the block will have the same relative index
// in those reductions
const auto idx_in_grid_red = index_utils::
maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size =
index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
gridDim) *
index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim) *
grid_red_size;
global_work_buffer += global_buffer_size;
}
flip = !flip;
// How many grid reductions have to be performed, in the grid dimension
const auto num_block_iters = index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
// Which grid reduction does this block participate in, in the grid
// dimension
const auto block_red_idx_offset = index_utils::
maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the block dimension
const auto num_thread_iters = index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim);
// Which grid reduction does this thread participate in, in the block
// dimension
const auto thread_red_idx_offset = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Offset into the work buffer
const auto work_buf_offset =
(idx_in_grid_red * num_block_iters + block_red_idx_offset) *
num_thread_iters +
thread_red_idx_offset;
// Don't read/write in temporary buffers if in a predicated dimension
bool grid_reduce_participate = index_utils::
maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
if (grid_reduce_participate && block_reduce_participate) {
if (has_block_result) {
copyTuple(global_work_buffer, work_buf_offset, block_result);
}
}
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (grid_reduce_participate) {
// Don't need to sync up blocks that are not participating in this
// reduction
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[block_red_idx_offset],
grid_red_size,
last_block,
block_dim);
}
// -- START BLOCK CLEANUP -- //
// All blocks perform the last cleanup, so every block, and every thread
// will have the final result
// Initialize block result
LocalTuple<Types...> last_block_result(init_val);
if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
// Can use the last block to reduce all the values the blocks filled in.
// Can use any thread that has been predicated, or has been reduced to do
// this reduction, cannot use any block that's associated with an
// iteration domain
// Start with non-block reduction
// Index in the reduction segment
int tid_in_block_reduction_2 = index_utils::maskedOffset<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx, block_dim);
int block_reduction_size_2 = index_utils::maskedSize<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Change the offset, we want to keep the last two dimensions, but the
// first dimension is what we will reduce over
const auto work_buf_offset_2 =
block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
for (auto reduction_i = tid_in_block_reduction_2;
reduction_i < grid_red_size;
reduction_i += block_reduction_size_2) {
impl::reduceTuple(
last_block_result,
0,
global_work_buffer,
work_buf_offset_2 +
reduction_i * num_block_iters *
num_thread_iters, // Iterating over the outer most
// dimension, so need to stride by the
// total number of grid reductions. Could
// come back and change it so this is the
// contiguous dimension
reduction_op);
}
// -- START LAST BLOCK - BLOCK REDUCTION -- //
// Reduced so we have one value per thread, we need to further reduce any
// dimension that is not an iter dimension
// Which block reduction this thread is participating in
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Offset in smem for this thread's result
auto smem_offset =
block_reduction_idx * block_reduction_size_2 + tid_in_block_reduction_2;
// Similar as before, reduce down to nearest power of 2 so we can do a
// tree reduction
int np2 = 1 << (31 - __clz(min(block_reduction_size_2, grid_red_size)));
// Threads values are initialized, so all can participate here
if (tid_in_block_reduction_2 >= np2) {
copyTuple(shared_buf, smem_offset, last_block_result);
}
block_sync::sync<Aligned>(block_dim);
if (tid_in_block_reduction_2 < np2 &&
tid_in_block_reduction_2 + np2 <
min(block_reduction_size_2, grid_red_size)) {
impl::reduceTuple(
last_block_result, 0, shared_buf, smem_offset + np2, reduction_op);
}
if (tid_in_block_reduction_2 < np2) {
copyTuple(shared_buf, smem_offset, last_block_result);
}
// Always sync when communicating across smem
block_sync::sync<Aligned>(block_dim);
// Reduce down to 2 values, last thread will do the final reduction and
// can save a syncthreads this way
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_block_reduction_2 < factor) {
impl::reduceTuple(
shared_buf,
smem_offset,
shared_buf,
smem_offset + factor,
reduction_op);
}
block_sync::sync<Aligned>(block_dim);
}
// If this thread in each block has the final result before broadcasting
// to all other threads in block
bool has_block_result_2 = index_utils::maskedIsZero<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx);
// Do the last reduction, protected by the write predicate
copyTuple(last_block_result, shared_buf, smem_offset);
if (has_block_result && grid_reduce_participate) {
impl::reduceTuple(last_block_result, 0, out, 0, reduction_op);
if (min(block_reduction_size_2, grid_red_size) > 1) {
impl::reduceTuple(
last_block_result, 0, shared_buf, smem_offset + 1, reduction_op);
}
}
if (grid_reduce_participate && PERSISTENT_REDUCTION) {
// If persistent reduction, always broadcast reduced values
copyTuple(shared_buf, smem_offset, last_block_result);
block_sync::sync<Aligned>(block_dim);
if (write_pred && block_reduce_participate) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size_2);
}
// For persistent kernels we double the global buffer allocation so we
// don't need to protect those buffers every iteration preventing the
// need of an additional grid_sync. Since we flip back and forth between
// sections of the buffer, the one grid sync protects the other part of
// the buffer.
} else {
if (grid_reduce_participate) {
if (last_block && has_block_result && block_reduce_participate &&
write_pred) {
copyTuple(
out, shared_buf, block_reduction_idx * block_reduction_size_2);
}
}
}
// Forward protect the smem used in this reduction
block_sync::sync<Aligned>(block_dim);
}
}
//! Profiled version
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <bool Aligned, typename Func, typename BlockDimT, typename... Types>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduce(
RefTuple<Types...> out,
const ConstRefTuple<Types...>& inp,
VolatilePtrTuple<Types...> global_work_buffer,
int64_t* global_sync_buffer, // Allocated as product of all
// non-participating Grid dimension
PtrTuple<Types...> shared_buf,
bool read_pred, // Prevent reading from out of bounds memory
bool write_pred, // Prevent from writing out of bounds
const LocalTuple<Types...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
Func reduction_op,
int64_t& cycles,
int64_t& count) {
int64_t start_counter = 0;
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
reduce<Aligned>(
out,
inp,
global_work_buffer,
global_sync_buffer,
shared_buf,
read_pred,
write_pred,
init_val,
block_dim,
reduction_op);
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
Funcs... funcs) {
static_assert(
sizeof...(DataTypes) == sizeof...(Funcs),
"Mismatched number of Tuple values and functions");
static_assert(
sizeof...(DataTypes) == sizeof...(BoolTypes),
"Mismatched number of Tuple values and predicate values");
// If no reduction needed, just return input
if (!BLOCK_REDUCE && !GRID_REDUCE) {
copyTupleIf(out, inp, read_preds && write_preds);
return;
}
// Don't read/write in temporary buffers if in a predicated dimension
const bool block_reduce_participate = index_utils::
maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
threadIdx);
// Only threads that with id == 0 in the dimensions being reduced will
// have a valid result
const bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
// Initial per-block reduction. Result is broadcast if specified
// and this call is block reduction only.
const auto block_result = reduceGroupBlock < !GRID_REDUCE && BROADCAST,
Aligned > (inp,
init_val,
block_dim,
shared_mem,
read_preds,
block_reduce_participate,
funcs...);
// If block reduction only, save to out and exit
if (!GRID_REDUCE) {
copyTupleIf(
out,
block_result,
write_preds &&
(block_reduce_participate && (BROADCAST || has_block_result)));
// Need a block sync here as reduceGroupBlock does not
// forward-protect the smem buffer. This block sync is not
// necessary when a grid reduction follows since a block sync is
// done just before the grid sync.
block_sync::sync<Aligned>(block_dim);
return;
}
// -- START GRID REDUCTION -- //
// Grid reductions are more challenging for two reasons, (1) the reduction
// itself is 3D instead of 2D because we now have an iter domain space in
// the grid dimension. (2) a tree reduction isn't performed, instead all
// blocks will populate GMEM and one block will finish the grid reduction.
// What is the grid reduction size, block reduction already performed so
// that doesn't have to be taken into consideration
const auto grid_red_size = index_utils::
maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
gridDim);
// Which ID in the reduction is this block. Threads can participate in
// multiple grid reductions, but the block will have the same relative index
// in those reductions
const auto idx_in_grid_red = index_utils::
maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the grid dimension
const auto num_block_iters = index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
// Which grid reduction does this block participate in, in the grid
// dimension
const auto block_red_idx_offset = index_utils::
maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the block dimension
const auto num_thread_iters = index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim);
// Which grid reduction does this thread participate in, in the block
// dimension
const auto thread_red_idx_offset = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Offset into the work buffer
const auto work_buf_offset =
(idx_in_grid_red * num_block_iters + block_red_idx_offset) *
num_thread_iters +
thread_red_idx_offset;
// Don't read/write in temporary buffers if in a predicated dimension
bool grid_reduce_participate = index_utils::
maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size =
index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
gridDim) *
index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim) *
grid_red_size;
global_work_buffer += global_buffer_size;
}
flip = !flip;
// Per-block partial reduction to global work buffer
if (grid_reduce_participate && block_reduce_participate && has_block_result) {
copyTuple(global_work_buffer, work_buf_offset, block_result);
}
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (grid_reduce_participate) {
// Don't need to sync up blocks that are not participating in this
// reduction
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[block_red_idx_offset],
grid_red_size,
last_block,
block_dim);
}
// -- START BLOCK CLEANUP -- //
reduceGroupLastBlock<Aligned>(
out,
global_work_buffer,
init_val,
block_dim,
shared_mem,
block_red_idx_offset,
num_thread_iters,
num_block_iters,
thread_red_idx_offset,
grid_red_size,
write_preds,
block_reduce_participate,
grid_reduce_participate,
funcs...);
// Forward protect the smem buffer
block_sync::sync<Aligned>(block_dim);
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduceGroup(
RefTuple<DataTypes...> out,
const ConstRefTuple<DataTypes...>& inp,
VolatilePtrTuple<DataTypes...> global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
int64_t* global_sync_buffer,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
const LocalTuple<BoolTypes...>& write_preds,
int64_t& cycles,
int64_t& count,
Funcs... funcs) {
int64_t start_counter = 0;
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
reduceGroup<Aligned>(
out,
inp,
global_work_buffer,
init_val,
block_dim,
global_sync_buffer,
shared_mem,
read_preds,
write_preds,
funcs...);
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool BLOCK_BROADCAST,
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ LocalTuple<DataTypes...> ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduceGroupBlock(
const ConstRefTuple<DataTypes...>& inp,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
const LocalTuple<BoolTypes...>& read_preds,
bool block_reduce_participate,
Funcs... funcs) {
const bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
// Initialize block result
LocalTuple<DataTypes...> block_result = init_val;
copyTupleIf(block_result, inp, block_reduce_participate && read_preds);
// Size of the block reduction segment, can be an int since it's limited
// to number of threads
const int block_reduction_size = index_utils::
maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
block_dim);
// Index in the reduction segment, can be an int since it's limited to
// number of threads
const int tid_in_block_reduction = index_utils::
maskedOffset<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx, block_dim);
// ID of the block reduction this thread is participating in
//
// If any of the parallel dimensions are predicated out, that means
// they've already been reduced, so we only care about the first thread in
// that dimension. Therefore don't expand the reduction_idx by that
// dimension
const int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Do not protect the smem buffer as it's not always necessary.
impl::blockReduceEach<
BLOCK_BROADCAST,
false,
Aligned,
LocalTuple<DataTypes...>,
BlockDimT,
Funcs...>(
block_result,
block_result,
shared_mem,
has_block_result,
tid_in_block_reduction,
block_reduction_size,
block_reduction_size,
block_reduction_idx,
block_dim,
funcs...);
return block_result;
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
typename BlockDimT,
typename... DataTypes,
typename... Funcs,
typename... BoolTypes>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
reduceGroupLastBlock(
RefTuple<DataTypes...>& out,
const VolatilePtrTuple<DataTypes...>& global_work_buffer,
const LocalTuple<DataTypes...>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
void* shared_mem,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const LocalTuple<BoolTypes...>& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate,
Funcs... reduction_ops) {
// Initialize block result
LocalTuple<DataTypes...> last_block_result(init_val);
const bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
// Can use the last block to reduce all the values the blocks filled in.
// Can use any thread that has been predicated, or has been reduced to do
// this reduction, cannot use any block that's associated with an
// iteration domain
// Start with non-block reduction
// Index in the reduction segment
int tid_in_block_reduction = index_utils::maskedOffset<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx, block_dim);
int block_reduction_size = index_utils::maskedSize<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(block_dim);
bool has_block_result = index_utils::maskedIsZero<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Change the offset, we want to keep the last two dimensions, but the
// first dimension is what we will reduce over
const auto work_buf_offset =
block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
for (auto reduction_i = tid_in_block_reduction; reduction_i < grid_red_size;
reduction_i += block_reduction_size) {
impl::reduceEach(
last_block_result,
0,
global_work_buffer,
work_buf_offset +
reduction_i * num_block_iters *
num_thread_iters, // Iterating over the outer most
// dimension, so need to stride by the
// total number of grid reductions. Could
// come back and change it so this is the
// contiguous dimension
reduction_ops...);
}
// Which block reduction this thread is participating in
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
impl::blockReduceEach<
BROADCAST,
false,
Aligned,
LocalTuple<DataTypes...>,
BlockDimT,
Funcs...>(
last_block_result,
last_block_result,
shared_mem,
has_block_result,
tid_in_block_reduction,
block_reduction_size,
min(grid_red_size, block_reduction_size),
block_reduction_idx,
block_dim,
reduction_ops...);
copyTupleIf(
out,
last_block_result,
write_preds &&
(block_reduce_participate && (BROADCAST || has_block_result)));
}
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
//! Implementation helper for welfordEach.
template <int ValIdx, typename Triplet0, typename Triplet1>
struct WelfordForEach {
static __inline__ __device__ void call(
Triplet0& triplet0,
nvfuser_index_t offset0,
const Triplet1& triplet1,
nvfuser_index_t offset1) {
static_assert(
Triplet0::num_vals == Triplet1::num_vals, "Invalid Triplet types");
static_assert(
IsSameType<typename Triplet0::DataType, typename Triplet1::DataType>::
value,
"Invalid Triplet types");
static_assert(
IsSameType<typename Triplet0::IndexType, typename Triplet1::IndexType>::
value,
"Invalid Triplet types");
using DataType = typename Triplet0::DataType;
using IndexType = typename Triplet0::IndexType;
WelfordForEach<ValIdx - 1, Triplet0, Triplet1>::call(
triplet0, offset0, triplet1, offset1);
welfordCombine<DataType, IndexType>(
triplet0.avg.val<ValIdx>(offset0),
triplet0.var.val<ValIdx>(offset0),
triplet0.N.val<ValIdx>(offset0),
triplet1.avg.val<ValIdx>(offset1),
triplet1.var.val<ValIdx>(offset1),
triplet1.N.val<ValIdx>(offset1));
}
};
template <typename Triplet0, typename Triplet1>
struct WelfordForEach<-1, Triplet0, Triplet1> {
__inline__ __device__ static void call(
Triplet0& triplet0,
nvfuser_index_t offset0,
const Triplet1& triplet1,
nvfuser_index_t offset1) {}
};
//! Call welfordCombine with each of the triplet tuples. This is a
//! welford version of reduceEach.
template <typename Triplet0, typename Triplet1>
__inline__ __device__ static void welfordEach(
Triplet0& triplet0,
nvfuser_index_t offset0,
const Triplet1& triplet1,
nvfuser_index_t offset1) {
WelfordForEach<Triplet0::num_vals - 1, Triplet0, Triplet1>::call(
triplet0, offset0, triplet1, offset1);
}
// Welford version of BlockReduceEach
template <
int idx,
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
struct BlockWelfordEach {
__inline__ __device__ static void reduce(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&
partial_result,
PtrTuple<DataType, DataType, IndexType> shared_buf,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
// Finish the reduction of each tuple value with a smaller offset
BlockWelfordEach<
idx - 1,
BROADCAST,
true,
Aligned,
NumVals,
DataType,
IndexType,
BlockDimT>::
reduce(
block_result,
partial_result,
shared_buf,
has_block_result,
tid_in_reduction,
num_threads_per_reduction,
num_elements_per_reduction,
reduction_idx,
block_dim);
if (num_elements_per_reduction == 1) {
if (has_block_result) {
copyWelfordTripletTuple(block_result, partial_result);
}
return;
}
LocalTuple<DataType, DataType, IndexType> block_result_i(
partial_result.avg.val<idx>(0),
partial_result.var.val<idx>(0),
partial_result.N.val<idx>(0));
const auto smem_offset =
reduction_idx * num_threads_per_reduction + tid_in_reduction;
const int np2 = 1 << (31 - __clz(num_elements_per_reduction));
// Threads values are initialized, so all can participate here
if (tid_in_reduction >= np2) {
copyTuple(shared_buf, smem_offset, block_result_i);
}
block_sync::sync<Aligned>(block_dim);
if (tid_in_reduction < np2 &&
tid_in_reduction + np2 < num_elements_per_reduction) {
impl::reduceTuple(
block_result_i,
0,
shared_buf,
smem_offset + np2,
welfordCombine<DataType, IndexType>);
}
if (tid_in_reduction < np2) {
copyTuple(shared_buf, smem_offset, block_result_i);
}
// Always sync when communicating across smem
block_sync::sync<Aligned>(block_dim);
// Reduce down to 2 values, last thread will do the final reduction and
// can save a syncthreads this way
for (int factor = np2 / 2; factor > 1; factor >>= 1) {
if (tid_in_reduction < factor) {
impl::reduceTuple(
shared_buf,
smem_offset,
shared_buf,
smem_offset + factor,
welfordCombine<DataType, IndexType>);
}
block_sync::sync<Aligned>(block_dim);
}
copyTuple(block_result_i, shared_buf, smem_offset);
// Do the last reduction
if (has_block_result) {
impl::reduceTuple(
block_result_i,
0,
shared_buf,
smem_offset + 1,
welfordCombine<DataType, IndexType>);
}
if (BROADCAST) {
if (has_block_result) {
// Put result back in shared memory, put in the first entry of the
// reduction segment's buffer
copyTuple(
shared_buf,
reduction_idx * num_threads_per_reduction,
block_result_i);
}
// Sync threads to make sure result is in smem
block_sync::sync<Aligned>(block_dim);
copyTuple(
block_result_i,
shared_buf,
reduction_idx * num_threads_per_reduction);
}
block_result.avg.val<idx>(0) = block_result_i.val<0>(0);
block_result.var.val<idx>(0) = block_result_i.val<1>(0);
block_result.N.val<idx>(0) = block_result_i.val<2>(0);
if (FORWARD_PROTECT_SMEM) {
block_sync::sync<Aligned>(block_dim);
}
}
};
// Specialization for idx == -1, i.e., no value to reduce.
template <
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
struct BlockWelfordEach<
-1,
BROADCAST,
FORWARD_PROTECT_SMEM,
Aligned,
NumVals,
DataType,
IndexType,
BlockDimT> {
__inline__ __device__ static void reduce(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&
partial_result,
PtrTuple<DataType, DataType, IndexType> shared_buf,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {}
};
//! Welford version of blockReduceEach. Perform block-parallel Welford
//! reduction of each Welford triplet.
template <
bool BROADCAST,
bool FORWARD_PROTECT_SMEM,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__inline__ __device__ void blockWelfordEach(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>&
partial_result,
PtrTuple<DataType, DataType, IndexType> shared_buf,
bool has_block_result,
int tid_in_reduction,
int num_threads_per_reduction,
int num_elements_per_reduction,
int reduction_idx,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim) {
BlockWelfordEach<
NumVals - 1,
BROADCAST,
FORWARD_PROTECT_SMEM,
Aligned,
NumVals,
DataType,
IndexType,
BlockDimT>::
reduce(
block_result,
partial_result,
shared_buf,
has_block_result,
tid_in_reduction,
num_threads_per_reduction,
num_elements_per_reduction,
reduction_idx,
block_dim);
}
} // namespace impl
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds) {
const ConstRefWelfordTripletTuple<NumArgs, DataType, IndexType> inp(
inp_avg, inp_var, inp_N);
RefWelfordTripletTuple<NumArgs, DataType, IndexType> out(
out_avg, out_var, out_N);
// If no reduction needed, just return input
if (!BLOCK_REDUCE && !GRID_REDUCE) {
copyWelfordTripletTupleIf(out, inp, read_preds && write_preds);
return;
}
// Don't read/write in temporary buffers if in a predicated dimension
const bool block_reduce_participate = index_utils::
maskedIsZero<isPred(X_THREAD), isPred(Y_THREAD), isPred(Z_THREAD)>(
threadIdx);
// Only threads that with id == 0 in the dimensions being reduced will
// have a valid result
const bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
LocalWelfordTripletTuple<NumArgs, DataType, IndexType> block_result(
init_avg, init_var, init_N);
// Initial per-block reduction. Result is broadcast if specified
// and this call is block reduction only.
welfordGroupBlock<
!GRID_REDUCE && BROADCAST,
Aligned,
NumArgs,
DataType,
IndexType>(
block_result,
inp,
block_dim,
shared_buf,
read_preds,
block_reduce_participate);
// If block reduction only, save to out and exit
if (!GRID_REDUCE) {
copyWelfordTripletTupleIf(
out,
block_result,
write_preds &&
(block_reduce_participate && (BROADCAST || has_block_result)));
// Need a block sync here as reduceGroupBlock does not
// forward-protect the smem buffer. This block sync is not
// necessary when a grid reduction follows since a block sync is
// done just before the grid sync.
block_sync::sync<Aligned>(block_dim);
return;
}
// -- START GRID REDUCTION -- //
// Grid reductions are more challenging for two reasons, (1) the reduction
// itself is 3D instead of 2D because we now have an iter domain space in
// the grid dimension. (2) a tree reduction isn't performed, instead all
// blocks will populate GMEM and one block will finish the grid reduction.
// What is the grid reduction size, block reduction already performed so
// that doesn't have to be taken into consideration
const auto grid_red_size = index_utils::
maskedSize<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
gridDim);
// Which ID in the reduction is this block. Threads can participate in
// multiple grid reductions, but the block will have the same relative index
// in those reductions
const auto idx_in_grid_red = index_utils::
maskedOffset<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the grid dimension
const auto num_block_iters = index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(gridDim);
// Which grid reduction does this block participate in, in the grid
// dimension
const auto block_red_idx_offset = index_utils::
maskedOffset<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
blockIdx, gridDim);
// How many grid reductions have to be performed, in the block dimension
const auto num_thread_iters = index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim);
// Which grid reduction does this thread participate in, in the block
// dimension
const auto thread_red_idx_offset = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Offset into the work buffer
auto work_buf_offset =
(idx_in_grid_red * num_block_iters + block_red_idx_offset) *
num_thread_iters +
thread_red_idx_offset;
// Don't read/write in temporary buffers if in a predicated dimension
bool grid_reduce_participate = index_utils::
maskedIsZero<isPred(X_BLOCK), isPred(Y_BLOCK), isPred(Z_BLOCK)>(blockIdx);
VolatilePtrWelfordTripletTuple<NumArgs, DataType, IndexType>
global_work_buffer(
global_work_buffer_avg, global_work_buffer_var, global_work_buffer_N);
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size =
index_utils::
maskedSize<isIter(X_BLOCK), isIter(Y_BLOCK), isIter(Z_BLOCK)>(
gridDim) *
index_utils::
maskedSize<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
block_dim) *
grid_red_size;
global_work_buffer += global_buffer_size;
}
flip = !flip;
// Per-block partial reduction to global work buffer
if (grid_reduce_participate && block_reduce_participate && has_block_result) {
copyWelfordTripletTuple(global_work_buffer, work_buf_offset, block_result);
}
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if (grid_reduce_participate) {
// Don't need to sync up blocks that are not participating in this
// reduction
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[block_red_idx_offset],
grid_red_size,
last_block,
block_dim);
}
// -- START BLOCK CLEANUP -- //
welfordGroupLastBlock<Aligned, NumArgs, DataType, IndexType>(
out,
global_work_buffer,
LocalWelfordTripletTuple<NumArgs, DataType, IndexType>(
init_avg, init_var, init_N),
block_dim,
shared_buf,
block_red_idx_offset,
num_thread_iters,
num_block_iters,
thread_red_idx_offset,
grid_red_size,
write_preds,
block_reduce_participate,
grid_reduce_participate);
// Forward protect the smem buffer
block_sync::sync<Aligned>(block_dim);
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
int NumArgs,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroup(
typename MakeRefTuple<NumArgs, DataType>::type out_avg,
typename MakeRefTuple<NumArgs, DataType>::type out_var,
typename MakeRefTuple<NumArgs, IndexType>::type out_N,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_avg,
const typename MakeConstRefTuple<NumArgs, DataType>::type& inp_var,
const typename MakeConstRefTuple<NumArgs, IndexType>::type& inp_N,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_avg,
const typename MakeLocalTuple<NumArgs, DataType>::type& init_var,
const typename MakeLocalTuple<NumArgs, IndexType>::type& init_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_avg,
typename MakeVolatilePtrTuple<NumArgs, DataType>::type
global_work_buffer_var,
typename MakeVolatilePtrTuple<NumArgs, IndexType>::type
global_work_buffer_N,
int64_t* global_sync_buffer,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumArgs, bool>::type& read_preds,
const typename MakeLocalTuple<NumArgs, bool>::type& write_preds,
int64_t& cycles,
int64_t& count) {
int64_t start_counter = 0;
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
welfordGroup<Aligned, NumArgs, DataType, IndexType>(
out_avg,
out_var,
out_N,
inp_avg,
inp_var,
inp_N,
init_avg,
init_var,
init_N,
block_dim,
global_work_buffer_avg,
global_work_buffer_var,
global_work_buffer_N,
global_sync_buffer,
shared_buf,
read_preds,
write_preds);
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool BLOCK_BROADCAST,
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroupBlock(
LocalWelfordTripletTuple<NumVals, DataType, IndexType>& block_result,
const ConstRefWelfordTripletTuple<NumVals, DataType, IndexType>& inp,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
const typename MakeLocalTuple<NumVals, bool>::type& read_preds,
bool block_reduce_participate) {
const bool has_block_result = index_utils::
maskedIsZero<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx);
copyWelfordTripletTupleIf(
block_result, inp, block_reduce_participate && read_preds);
// Size of the block reduction segment, can be an int since it's limited
// to number of threads
const int block_reduction_size = index_utils::
maskedSize<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
block_dim);
// Index in the reduction segment, can be an int since it's limited to
// number of threads
const int tid_in_block_reduction = index_utils::
maskedOffset<isReduce(X_THREAD), isReduce(Y_THREAD), isReduce(Z_THREAD)>(
threadIdx, block_dim);
// ID of the block reduction this thread is participating in
//
// If any of the parallel dimensions are predicated out, that means
// they've already been reduced, so we only care about the first thread in
// that dimension. Therefore don't expand the reduction_idx by that
// dimension
const int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
// Do not protect the smem buffer as it's not always necessary.
impl::blockWelfordEach<
BLOCK_BROADCAST,
false,
Aligned,
NumVals,
DataType,
IndexType,
BlockDimT>(
block_result,
block_result,
shared_buf,
has_block_result,
tid_in_block_reduction,
block_reduction_size,
block_reduction_size,
block_reduction_idx,
block_dim);
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
int NumVals,
typename DataType,
typename IndexType,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroupLastBlock(
RefWelfordTripletTuple<NumVals, DataType, IndexType>& out,
const VolatilePtrWelfordTripletTuple<NumVals, DataType, IndexType>&
global_work_buffer,
const LocalWelfordTripletTuple<NumVals, DataType, IndexType>& init_val,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
PtrTuple<DataType, DataType, IndexType> shared_buf,
nvfuser_index_t block_red_idx_offset,
nvfuser_index_t num_thread_iters,
nvfuser_index_t num_block_iters,
nvfuser_index_t thread_red_idx_offset,
nvfuser_index_t grid_red_size,
const typename MakeLocalTuple<NumVals, bool>::type& write_preds,
bool block_reduce_participate,
bool grid_reduce_participate) {
// Initialize block result
auto last_block_result = init_val;
const bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
if ((PERSISTENT_REDUCTION || last_block) && grid_reduce_participate) {
// Can use the last block to reduce all the values the blocks filled in.
// Can use any thread that has been predicated, or has been reduced to do
// this reduction, cannot use any block that's associated with an
// iteration domain
// Start with non-block reduction
// Index in the reduction segment
int tid_in_block_reduction = index_utils::maskedOffset<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx, block_dim);
int block_reduction_size = index_utils::maskedSize<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(block_dim);
bool has_block_result = index_utils::maskedIsZero<
activeNotIter(X_THREAD),
activeNotIter(Y_THREAD),
activeNotIter(Z_THREAD)>(threadIdx);
// 3D buffer of reductions:
// [reduction_offset(grid), iter_offset(grid), iter_offset(block)]
// Change the offset, we want to keep the last two dimensions, but the
// first dimension is what we will reduce over
const auto work_buf_offset =
block_red_idx_offset * num_thread_iters + thread_red_idx_offset;
for (auto reduction_i = tid_in_block_reduction; reduction_i < grid_red_size;
reduction_i += block_reduction_size) {
impl::welfordEach(
last_block_result,
0,
global_work_buffer,
work_buf_offset + reduction_i * num_block_iters * num_thread_iters);
}
// Which block reduction this thread is participating in
int block_reduction_idx = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
impl::blockWelfordEach<
BROADCAST,
false,
Aligned,
NumVals,
DataType,
IndexType>(
last_block_result,
last_block_result,
shared_buf,
has_block_result,
tid_in_block_reduction,
block_reduction_size,
min(grid_red_size, block_reduction_size),
block_reduction_idx,
block_dim);
copyWelfordTripletTupleIf(
out,
last_block_result,
write_preds &&
(block_reduce_participate && (BROADCAST || has_block_result)));
}
}
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
// Grouped block welford optimized for outer reductions with
// TIDx and TIDy mapped to non-reduction and reduction domains,
// respectively with unused TIDz.
//
// The main motivation of this optimized version is the same as the
// grouped grid reduction, i.e, by doing multiple reductions together,
// it is possible to reduce the number of synchronizations. However,
// unlike the grouped grid reduction, the cost of grouping can be
// prohitively high, i.e., the size of the work buffer must be
// expanded by a factor of grouping. In the case of grid
// reductions, the buffer is on global memory, so the space requirement
// is not a concern, but that isn't the case with block reductions,
// since the buffer is on shared memory, which has a limited
// capacity.
//
// This implementation tries to benefit from aggregated block
// synchronizations while minimizing the cost of the expanded buffer
// size by first partially reducing the input within each warp. It
// would save the required buffer size by a factor of WARP_SIZE /
// blockDim.x as the reduction is done along threadIdx.y. So to be
// effective, blockDim.x needs to be smaller than WARP_SIZE, and in the
// case of grouped grid welford, it should be typically 8 or 16.
//
// The algorithm is an adaptation of scattered butterfly reduction,
// aka recursive halving, commonly used for implementing
// MPI_Reduce_scatter. For a visual illustration of the data
// organization, see, for example, page 22 of Solomonik,
// Design of Parallel and High-Performance Computing:
// Distributed-Memory Models and Algorithms, 2015
// (https://solomonik.cs.illinois.edu/talks/dphpc-dec-2015.pdf)
//
// Assumptions:
// - blockDim.x and blockDim.y are statically known values so that all
// loops can be completely unrolled
// - blockDim.x is smaller than WARP_SIZE
// - blockDim.x evenly divides WARP_SIZE
// - There are multiple warps per block
// - The gouping factor, NumVals, is at least as large as the warp
// dimY and is divisible by the warp dimY.
//
// This is meant to be used as part of the grouped grid welford
// reduction but should be usable as a standalone block welford routine as
// long as the above assumptions hold.
//
// Note: Having an output reference parameter resulted in using more
// registers than just returing the output. Results would vary
// depending on compiler versions, but it seems safer to return outputs
// as a new value.
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__inline__ __device__ WelfordTriplet<DataType> blockWelfordOuter(
DataType* inp_avg,
DataType* inp_var,
nvfuser_index_t inp_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* smem) {
constexpr int num_warps = BDIMX * BDIMY / 32;
static_assert(num_warps >= 1, "There must be at least a single warp");
static_assert(32 % BDIMX == 0, "blockDimx.x must be able to divide 32");
const int tid = threadIdx.x + threadIdx.y * BDIMX;
const int wid = tid / 32;
// Dimension of the Y axis within each warp
constexpr int wdimy = 32 / BDIMX;
static_assert(NumVals >= wdimy, "NumVals must be >= 32 / blockDim.x");
static_assert(
NumVals % wdimy == 0, "NumVals must be divisible by 32 / blockDim.x");
// There must be at least a single warp
// Y index within each warp
const int warp_tidy = threadIdx.y % wdimy;
// Thread index in each warp
const int lane_id = threadIdx.x + warp_tidy * BDIMX;
constexpr int smem_var_offset = num_warps * BDIMX * NumVals;
constexpr int smem_N_offset = num_warps * BDIMX * NumVals * 2;
// We define a chunk as a value in a group and a chunk size as the
// number of group values per thread. Initially, the chunk size is
// NumVals. After the initial warp reduction, the chunk size is
// reduced to NumVals/wdimy. For example, suppose NumVals=8,
// blockDim.x=8, blockDim.y=32, then wdimy=4, so after the initial
// warp reduction, the chunk size is 2. This is the number of
// elements each thread stores to shared memory.
int chunk_size = NumVals;
// Butterfly reduction, a.k.a. recursive halving as each iteration
// halves the number of values
#pragma unroll
for (int lane_mask = 16; lane_mask >= BDIMX; lane_mask /= 2) {
chunk_size /= 2;
const auto peer_N = __shfl_xor_sync(0xffffffff, inp_N, lane_mask);
const auto updated_N = inp_N + peer_N;
const DataType b_N_div_ab_N =
updated_N != 0 ? ((DataType)peer_N) / ((DataType)updated_N) : 0;
#pragma unroll
for (int index_in_chunk = 0; index_in_chunk < chunk_size;
++index_in_chunk) {
DataType pushed_avg = 0;
DataType pushed_var = 0;
DataType self_avg = 0;
DataType self_var = 0;
// Divergent branch. Not a big deal with independent scheduling?
if (lane_id & lane_mask) {
// Push first half
auto push_offset = index_in_chunk;
auto self_offset = index_in_chunk + chunk_size;
pushed_avg = inp_avg[push_offset];
pushed_var = inp_var[push_offset];
self_avg = inp_avg[self_offset];
self_var = inp_var[self_offset];
} else {
// Push second half
auto push_offset = index_in_chunk + chunk_size;
auto self_offset = index_in_chunk;
pushed_avg = inp_avg[push_offset];
pushed_var = inp_var[push_offset];
self_avg = inp_avg[self_offset];
self_var = inp_var[self_offset];
}
auto peer_avg = __shfl_xor_sync(0xffffffff, pushed_avg, lane_mask);
auto peer_var = __shfl_xor_sync(0xffffffff, pushed_var, lane_mask);
auto delta = peer_avg - self_avg;
self_avg += delta * b_N_div_ab_N;
self_var += peer_var + delta * delta * ((DataType)(inp_N)) * b_N_div_ab_N;
inp_avg[index_in_chunk] = self_avg;
inp_var[index_in_chunk] = self_var;
}
inp_N = updated_N;
}
// At this point, chunk_size is reduced to NumVals/wdimy as
// mentioned above. Each thread has warp-reduced chunk_size values
// in array inp. This chunk_size_post_reduction should be equal to
// chunk_size at this point.
constexpr int chunk_size_post_reduction = NumVals / wdimy;
// More specifically, the warp_tidy of each thread defines
// the chunk IDs held by the thread as follows:
//
// [warp_tidy * chunk_size_post_reduction, warp_tidy *
// chunk_size_post_reduction + chunk_size_post_reduction]
//
// Each thread uploads the chunk_size_post_reduction values one by
// one. Each chunk is spread by BDIMX * BDIMY values. The data
// layout of the shared memory is:
//
// [chunk_size, wid, warp_tidy, TIDx]
//
// The remaining reduction is done on the WID
// dimension. More specifically, we assign one warp per chunk (or
// a value of the group). The wdimy threads of the same threadId.x
// collectively reduce num_warps partial results, each of which is
// stored with stride 32. This means that there will be wdimy-way
// bank conflicts, so to avoid that, swizzling is also employed.
#pragma unroll
for (int i = 0; i < chunk_size; ++i) {
// Accumulating smem offset from the innermost dimension
int smem_offset = 0;
// TIDx
smem_offset += threadIdx.x;
// Warp_TIDy with swizzle
smem_offset += ((warp_tidy + wid) % wdimy) * BDIMX;
// WID
smem_offset += wid * 32;
// chunk_size
smem_offset += i * BDIMX * BDIMY;
smem[smem_offset] = inp_avg[i];
smem[smem_var_offset + smem_offset] = inp_var[i];
// Upload N only when threadIdx.x == 0 && chunk_index == 0
if (threadIdx.x == 0 && i == 0 && warp_tidy == 0) {
reinterpret_cast<nvfuser_index_t*>(smem + smem_N_offset)[wid] = inp_N;
}
}
block_sync::sync<Aligned>(block_dim);
// The next step is to let each thread of a warp independently
// accumulate the partial results on the shared memory
// reduction. A single warp is used to accumulate of the partial
// results for a single chunk, so warp wid takes care of the wid-th
// chunk.
//
// The starting offset of partial results of a chunk is:
//
// (wid % chunk_size_post_reduction) * BDIMX * BDIMY + (wid /
// chunk_size_post_reduction) * BDIMX
//
// Note that each thread had chunk_size_post_reduction contiguous
// chunks, so when uploaded to shmem, they are strided by
// BDIMX*BDIMY, hence (wid % chunk_size_post_reduction) * BDIMX *
// BDIMY.
// The vector width is likely at least 4, so at least 4 warps should
// be used, which is
// enough to occupy an SM. When NumVals=8, it might be more
// efficient to use just 4 warps with each warp taking care of two
// groups, but the difference would be pretty small.
// Also, the number of warps should be at least 8 and can be 16
// too. NumVals should be 8 at largest, so it's always num_warps >=
// NumVals.
DataType avg = 0;
DataType var = 0;
nvfuser_index_t N = 0;
static_assert(
num_warps >= NumVals,
"Number of warps must be at least as large as NumVals");
if (wid < NumVals) {
#pragma unroll
for (int i = warp_tidy; i < num_warps; i += wdimy) {
int offset = 0;
offset += threadIdx.x;
// Offset to the partial results of the i-th warp
offset += i * 32;
// Offset to the chunk for this warp. Swizzled to avoid bank
// conflicts.
offset += ((wid / chunk_size + i) % wdimy) * BDIMX;
offset += (wid % chunk_size) * BDIMX * BDIMY;
DataType avg_smem = smem[offset];
DataType var_smem = smem[smem_var_offset + offset];
nvfuser_index_t N_smem =
reinterpret_cast<nvfuser_index_t*>(&smem[smem_N_offset])[i];
welfordCombine(avg, var, N, avg_smem, var_smem, N_smem);
}
}
block_sync::sync<Aligned>(block_dim);
// Nothing to do for warps whose wid is larger than NunVals
if (wid >= NumVals) {
WelfordTriplet<DataType> out = {0, 0, 0};
return out;
}
// Standard binary-exchange reduction within wdimy intra-warp
// threads.
#pragma unroll
for (int lane_mask = 16; lane_mask >= BDIMX; lane_mask /= 2) {
auto avg_peer = __shfl_xor_sync(0xffffffff, avg, lane_mask);
auto var_peer = __shfl_xor_sync(0xffffffff, var, lane_mask);
auto N_peer = __shfl_xor_sync(0xffffffff, N, lane_mask);
welfordCombine(avg, var, N, avg_peer, var_peer, N_peer);
}
WelfordTriplet<DataType> out = {avg, var, N};
return out;
}
} // namespace impl
} // namespace fused_reduction
// clang-format off
/*
* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
* All rights reserved.
* SPDX-License-Identifier: BSD-3-Clause
*/
// clang-format on
namespace fused_reduction {
namespace impl {
// Utility struct to hold multiple values for grouped Welford. The
// count is uniform, so there's only one N value.
template <int NumVals, typename DataType>
struct WelfordTripletVector {
Array<DataType, NumVals, NumVals> avg_;
Array<DataType, NumVals, NumVals> var_;
nvfuser_index_t N_;
WelfordTripletVector() = default;
__device__ WelfordTripletVector(
const DataType avg[NumVals],
const DataType var[NumVals],
const nvfuser_index_t N) {
memcpy(avg_.array, avg, sizeof(DataType) * NumVals);
memcpy(var_.array, var, sizeof(DataType) * NumVals);
N_ = N;
}
__device__ WelfordTripletVector& operator=(
const WelfordTripletVector<NumVals, DataType>& other) {
avg_ = other.avg_;
var_ = other.var_;
N_ = other.N_;
return *this;
}
__device__ void init() {
avg_.set((DataType)0);
var_.set((DataType)0);
N_ = 0;
}
__device__ DataType& avg(int idx) {
return avg_[idx];
}
__device__ DataType avg(int idx) const {
return avg_.array[idx];
}
__device__ DataType& var(int idx) {
return var_[idx];
}
__device__ DataType var(int idx) const {
return var_.array[idx];
}
__device__ nvfuser_index_t& N() {
return N_;
}
__device__ nvfuser_index_t N() const {
return N_;
}
};
// The offset in smem buffer to broadcast final results within a
// thread block
template <int BDIMX>
__inline__ __device__ int getSmemGroupOffset(int iter_idx, int group_idx) {
return group_idx * BDIMX + iter_idx;
}
// Upload the final results to smem for intra-block broadcasting
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__inline__ __device__ void copyFromTripletToSmem(
DataType* smem,
int iter_idx,
int group_idx,
const WelfordTriplet<DataType>& local_triplet) {
int offset = getSmemGroupOffset<BDIMX>(iter_idx, group_idx);
smem[offset] = local_triplet.avg;
int smem_stride = BDIMX * NumVals;
smem[smem_stride + offset] = local_triplet.var;
if (iter_idx == 0 && group_idx == 0) {
reinterpret_cast<nvfuser_index_t*>(smem + smem_stride * 2)[0] =
local_triplet.N;
}
}
// Fetch the final results from smem for intra-block broadcasting
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__inline__ __device__ void copyFromSmemToTriplet(
WelfordTriplet<DataType>& local_triplet,
const DataType* smem,
int iter_idx,
int group_idx) {
int offset = getSmemGroupOffset<BDIMX>(iter_idx, group_idx);
local_triplet.avg = smem[offset];
int smem_stride = BDIMX * NumVals;
local_triplet.var = smem[smem_stride + offset];
local_triplet.N =
reinterpret_cast<const nvfuser_index_t*>(smem + smem_stride * 2)[0];
}
// Per-thread accumulation of the per-block partial results in global
// memory. There's gridDim.y partial results, which is accumulated in
// parallel by threadIdx.y. This should be followed by a block reduction.
template <int NumVals, typename DataType, int BDIMX, int BDIMY>
__device__ __inline__ WelfordTripletVector<NumVals, DataType>
welfordGroupAccumulateGlobalBuffer(
volatile DataType* global_buf_avg,
volatile DataType* global_buf_var,
volatile nvfuser_index_t* global_buf_N,
bool flip) {
const int grid_size = gridDim.x * gridDim.y;
const int iter_idx = threadIdx.x;
const int red_idx = threadIdx.y;
const int num_threads_per_reduction = BDIMY;
WelfordTripletVector<NumVals, DataType> results;
results.init();
// Reduction is done cooperatively with the thread blocks with the
// same blockIdx.x. Thread blocks with the same blockIdx.x uses a
// global buffer of size blockDim.x * gridDim.y for each value in a
// group.
// Advance the global buffer pointers to the location of the values
// to accumulate for the first group value (i.e., gi == 0 in the
// below NumVals loop)
global_buf_avg += iter_idx + blockIdx.x * BDIMX * gridDim.y;
global_buf_var += iter_idx + blockIdx.x * BDIMX * gridDim.y;
global_buf_N += iter_idx + blockIdx.x * BDIMX * gridDim.y;
if (flip) {
global_buf_avg += BDIMX * grid_size * NumVals;
global_buf_var += BDIMX * grid_size * NumVals;
global_buf_N += BDIMX * grid_size * NumVals;
}
// Since there's gridDim.y elements to reduce using blockDim.y
// threads, loop over gridDim.y with stride blockDim.y. First, just
// grab the values in the global memory.
if (red_idx < gridDim.y) {
int work_buf_offset = red_idx * BDIMX;
// N is constant across NumVals
const auto g_N = global_buf_N[work_buf_offset];
results.N() = g_N;
// Just copy the first elements
#pragma unroll
for (int gi = 0; gi < NumVals; ++gi) {
auto& a_avg = results.avg(gi);
auto& a_var = results.var(gi);
auto b_avg = global_buf_avg[work_buf_offset];
auto b_var = global_buf_var[work_buf_offset];
work_buf_offset += grid_size * BDIMX;
results.avg(gi) = b_avg;
results.var(gi) = b_var;
}
}
// Accumulate into results by looping over the remaining results in
// the global buffer
for (int ri = red_idx + num_threads_per_reduction; ri < gridDim.y;
ri += num_threads_per_reduction) {
int work_buf_offset = ri * BDIMX;
// N is constant across NumVals
const auto g_N = global_buf_N[work_buf_offset];
nvfuser_index_t updated_N = results.N() + g_N;
// Hoist the division by updated_N as it's invariant over the
// NumVals loop
DataType b_N_div_ab_N = updated_N != 0
? (((DataType)g_N) / ((DataType)updated_N))
: (DataType)0;
DataType a_N_b_N_div_ab_N = ((DataType)results.N()) * b_N_div_ab_N;
#pragma unroll
for (int gi = 0; gi < NumVals; ++gi) {
auto& a_avg = results.avg(gi);
auto& a_var = results.var(gi);
auto b_avg = global_buf_avg[work_buf_offset];
auto b_var = global_buf_var[work_buf_offset];
work_buf_offset += grid_size * BDIMX;
auto delta = b_avg - a_avg;
a_avg += delta * b_N_div_ab_N;
a_var += b_var + delta * delta * a_N_b_N_div_ab_N;
}
results.N() = updated_N;
}
return results;
}
} // namespace impl
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer) {
using namespace fused_reduction::impl;
static_assert(
isIter(X_BLOCK) && isReduce(Y_BLOCK) && inactive(Z_BLOCK) &&
isIter(X_THREAD) && isReduce(Y_THREAD) && inactive(Z_THREAD),
"Invalid parallelization for outer welford reduction");
static_assert(
BDIMY % NumVals == 0, "blockDim.y must be divisible by group count");
static_assert(BDIMX <= 32, "blockDim.x must be up to 32.");
static_assert(
(BDIMX * BDIMY) % 32 == 0, "Number of threads must be a multiple of 32.");
static_assert(32 % BDIMX == 0, "blockDim.x must be able to divide 32.");
static_assert(
NumVals >= (32 / BDIMX), "Group count must be >= 32 / blockDim.x");
#pragma unroll
for (int i = 0; i < NumVals; ++i) {
out_avg[i] = in_avg[i];
out_var[i] = in_var[i];
}
auto iter_tid = index_utils::
maskedOffset<isIter(X_THREAD), isIter(Y_THREAD), isIter(Z_THREAD)>(
threadIdx, block_dim);
auto per_block_result =
impl::blockWelfordOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
out_avg, out_var, in_N, block_dim, shared_buf);
// At this point, threads with tid_in_group == 0 has valid partial
// results. Store them to global buffer.
const int grid_size = gridDim.x * gridDim.y;
const int iter_idx = threadIdx.x;
// Stores the partial results into the global work buffer. Only
// threads with tid_in_group have the valid partial results
const int wid = (threadIdx.x + threadIdx.y * BDIMX) / 32;
constexpr int wdimy = 32 / BDIMX;
const int warp_tidy = threadIdx.y % wdimy;
const bool has_valid_block_reduction_result = warp_tidy == 0 && wid < NumVals;
// Each valid result is held by a warp
const int valid_group_idx = wid;
if (has_valid_block_reduction_result) {
int work_buf_offset = iter_idx + blockIdx.y * BDIMX +
blockIdx.x * BDIMX * gridDim.y + valid_group_idx * BDIMX * grid_size;
if (PERSISTENT_REDUCTION && flip) {
auto global_buffer_size = BDIMX * grid_size * NumVals;
work_buf_offset += global_buffer_size;
}
global_buf_avg[work_buf_offset] = per_block_result.avg;
global_buf_var[work_buf_offset] = per_block_result.var;
// the count values should be the same across the group, so just
// store once
if (valid_group_idx == 0) {
global_buf_N[work_buf_offset] = per_block_result.N;
}
}
flip = !flip;
// -- GLOBAL BUFFER FILLED -- //
bool last_block = index_utils::
maskedIsLast<isReduce(X_BLOCK), isReduce(Y_BLOCK), isReduce(Z_BLOCK)>(
blockIdx, gridDim);
grid_sync::sync<
isReduce(X_BLOCK),
isReduce(Y_BLOCK),
isReduce(Z_BLOCK),
PERSISTENT_REDUCTION,
Aligned>(
global_sync_buffer[blockIdx.x], gridDim.y, last_block, block_dim);
auto partial_results =
welfordGroupAccumulateGlobalBuffer<NumVals, DataType, BDIMX, BDIMY>(
global_buf_avg, global_buf_var, global_buf_N, !flip);
auto per_block_final_result =
impl::blockWelfordOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
partial_results.avg_.array,
partial_results.var_.array,
partial_results.N_,
block_dim,
shared_buf);
// At this point, each thread of the groups with tid_in_group=0
// has the final reduction result. We need to upload them to
// shmem for broadcasting.
if (has_valid_block_reduction_result) {
copyFromTripletToSmem<NumVals, DataType, BDIMX, BDIMY>(
shared_buf, iter_idx, valid_group_idx, per_block_final_result);
}
__syncthreads();
#pragma unroll
for (int i = 0; i < NumVals; ++i) {
WelfordTriplet<DataType> final_result;
copyFromSmemToTriplet<NumVals, DataType, BDIMX, BDIMY>(
final_result, shared_buf, iter_idx, i);
out_avg[i] = final_result.avg;
out_var[i] = final_result.var;
in_N = final_result.N;
}
#pragma unroll
for (int i = 0; i < NumVals; ++i) {
out_N[i] = in_N;
}
// Forward protect the smem buffer
__syncthreads();
}
template <
int X_BLOCK,
int Y_BLOCK,
int Z_BLOCK,
int X_THREAD,
int Y_THREAD,
int Z_THREAD,
bool PERSISTENT_REDUCTION,
bool BROADCAST>
template <
bool Aligned,
int NumVals,
typename DataType,
int BDIMX,
int BDIMY,
typename BlockDimT>
__device__ __inline__ void ParallelReduce<
X_BLOCK,
Y_BLOCK,
Z_BLOCK,
X_THREAD,
Y_THREAD,
Z_THREAD,
PERSISTENT_REDUCTION,
BROADCAST>::
welfordGroupOuter(
DataType out_avg[NumVals],
DataType out_var[NumVals],
nvfuser_index_t out_N[NumVals],
const DataType in_avg[NumVals],
const DataType in_var[NumVals],
nvfuser_index_t in_N,
// block_dim is basically just blockDim (wrapped as DefaultBlockDim) if
// there is no warp specialization in the kernel. If there is warp
// specialization, block_dim is the the dimension of the compute warps.
BlockDimT block_dim,
DataType* global_buf_avg,
DataType* global_buf_var,
nvfuser_index_t* global_buf_N,
DataType* shared_buf,
int64_t* global_sync_buffer,
int64_t& cycles,
int64_t& count) {
int64_t start_counter = 0;
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
start_counter = readCycleCounter();
}
welfordGroupOuter<Aligned, NumVals, DataType, BDIMX, BDIMY>(
out_avg,
out_var,
out_N,
in_avg,
in_var,
in_N,
block_dim,
global_buf_avg,
global_buf_var,
global_buf_N,
shared_buf,
global_sync_buffer);
if (isLastBlockInGrid() &&
index_utils::maskedIsZero<true, true, true>(threadIdx)) {
cycles += readCycleCounter() - start_counter;
++count;
}
}
} // namespace fused_reduction
// Codegen generated code
Test Diffs
Toggle All
1: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_32
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 54
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103395arrayE[];
.entry _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
.reg .b32 %r<334>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
ld.param.v2.u32 {%r109, %r110}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r119, %r120}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r123, %r124}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r145, %r110, 1;
shr.u32 %r146, %r145, 31;
add.s32 %r147, %r145, %r146;
shr.s32 %r2, %r147, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r148, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r149, %r4, 3;
mad.lo.s32 %r150, %r149, %r148, 15;
and.b32 %r151, %r150, -16;
cvt.u64.u32 %rd1, %r151;
mul.lo.s32 %r152, %r4, %r2;
shl.b32 %r153, %r152, 4;
or.b32 %r154, %r153, 15;
and.b32 %r5, %r154, -16;
add.s32 %r155, %r154, %r5;
and.b32 %r156, %r155, -16;
cvt.s64.s32 %rd2, %r156;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
cvt.rn.f64.s32 %fd1, %r110;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r157, %r7, 1;
setp.lt.s32 %p10, %r157, %r110;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r158, smem_ptr; }
// end inline asm
shl.b32 %r161, %r6, 4;
add.s32 %r159, %r158, %r161;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r160, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r160, 0;
cp.async.ca.shared.global [%r159], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r162, %r4, 215;
div.s32 %r163, %r162, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r164, %r9, %r163;
add.s32 %r165, %r164, -1;
div.s32 %r10, %r165, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r167, %ctaid.y;
mul.lo.s32 %r168, %r10, %r4;
mul.lo.s32 %r11, %r168, %r167;
shl.b32 %r169, %r8, 3;
shl.b32 %r170, %r6, 4;
mad.lo.s32 %r12, %r169, %r110, %r170;
mul.lo.s32 %r171, %r110, %r8;
cvt.s64.s32 %rd52, %r171;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r172, %r11, %r110;
cvt.s64.s32 %rd6, %r172;
mul.lo.s32 %r13, %r110, %r4;
mul.lo.s32 %r14, %r10, %r167;
add.s32 %r15, %r171, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r15, 8;
add.s64 %rd7, %rd54, %rd55;
mov.u32 %r173, %tid.z;
mad.lo.s32 %r174, %r4, %r173, %r8;
mad.lo.s32 %r16, %r174, %r3, %r6;
mul.wide.u32 %rd56, %r16, 8;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r175, %r3;
mov.u32 %r176, 31;
sub.s32 %r177, %r176, %r175;
mov.u32 %r178, 1;
shl.b32 %r17, %r178, %r177;
setp.lt.u32 %p14, %r6, %r17;
add.s32 %r179, %r17, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r180, %r16, %r17;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r181, %r17, 31;
add.s32 %r182, %r17, %r181;
shr.s32 %r18, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r183, %r16, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r174, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r316, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
// end inline asm
add.s32 %r187, %r12, %r186;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
// end inline asm
add.s32 %r197, %r12, %r196;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r184, %r316, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r189, %r13, %r316;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r188, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r188, 0;
cp.async.ca.shared.global [%r187], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r190, %r14, %r316;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd268, 0d0000000000000000;
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r192, %r14, %r316;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
add.s32 %r194, %r14, %r316;
mad.lo.s32 %r22, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
setp.gt.s32 %p21, %r22, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
mul.lo.s32 %r195, %r22, %r119;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
setp.lt.s32 %p22, %r22, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
mul.lo.s32 %r199, %r13, %r316;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r198, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r198, 0;
cp.async.ca.shared.global [%r197], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
add.s32 %r315, %r14, %r316;
mad.lo.s32 %r314, %r315, %r4, %r8;
setp.gt.s32 %p117, %r314, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
mul.lo.s32 %r200, %r22, %r123;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd275, %fd274;
@%p23 bra $L__BB0_22;
ld.shared.v2.f64 {%fd103, %fd104}, [%rd7];
ld.shared.v2.f64 {%fd107, %fd108}, [%rd10];
ld.shared.v2.f64 {%fd111, %fd112}, [%rd12];
mul.f64 %fd115, %fd111, %fd103;
add.f64 %fd116, %fd115, 0d0000000000000000;
sub.f64 %fd117, %fd107, %fd270;
mul.f64 %fd118, %fd271, %fd117;
fma.rn.f64 %fd119, %fd115, %fd118, 0d0000000000000000;
fma.rn.f64 %fd272, %fd118, %fd103, %fd272;
mul.f64 %fd120, %fd112, %fd104;
add.f64 %fd275, %fd116, %fd120;
sub.f64 %fd121, %fd108, %fd270;
mul.f64 %fd122, %fd271, %fd121;
fma.rn.f64 %fd274, %fd120, %fd122, %fd119;
fma.rn.f64 %fd273, %fd122, %fd104, %fd273;
$L__BB0_22:
st.shared.f64 [%rd8], %fd275;
bar.sync 0;
@%p26 bra $L__BB0_24;
ld.shared.f64 %fd123, [%rd9];
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
setp.lt.s32 %p27, %r17, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
mov.u32 %r317, %r18;
$L__BB0_26:
setp.ge.u32 %p28, %r6, %r317;
@%p28 bra $L__BB0_28;
add.s32 %r201, %r317, %r16;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
shr.u32 %r24, %r317, 1;
setp.gt.u32 %p29, %r317, 3;
mov.u32 %r317, %r24;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@%p30 bra $L__BB0_32;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f64 %fd130, [%rd8];
add.f64 %fd276, %fd130, 0d0000000000000000;
@%p31 bra $L__BB0_32;
ld.shared.f64 %fd131, [%rd11];
add.f64 %fd276, %fd276, %fd131;
$L__BB0_32:
bar.sync 0;
st.shared.f64 [%rd8], %fd274;
bar.sync 0;
@%p26 bra $L__BB0_34;
ld.shared.f64 %fd132, [%rd9];
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
setp.lt.s32 %p118, %r17, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
mov.u32 %r318, %r18;
$L__BB0_36:
setp.ge.u32 %p34, %r6, %r318;
@%p34 bra $L__BB0_38;
add.s32 %r202, %r318, %r16;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
shr.u32 %r26, %r318, 1;
setp.gt.u32 %p35, %r318, 3;
mov.u32 %r318, %r26;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f64 %fd139, [%rd8];
add.f64 %fd277, %fd139, 0d0000000000000000;
@%p37 bra $L__BB0_42;
ld.shared.f64 %fd140, [%rd11];
add.f64 %fd277, %fd277, %fd140;
$L__BB0_42:
bar.sync 0;
@%p30 bra $L__BB0_44;
st.shared.f64 [%rd13], %fd276;
$L__BB0_44:
bar.sync 0;
ld.shared.f64 %fd33, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_46;
st.shared.f64 [%rd13], %fd277;
$L__BB0_46:
bar.sync 0;
ld.shared.f64 %fd34, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_48;
mul.f64 %fd141, %fd2, %fd271;
ld.shared.v2.f64 {%fd142, %fd143}, [%rd10];
ld.shared.v2.f64 {%fd146, %fd147}, [%rd12];
ld.shared.v2.f64 {%fd150, %fd151}, [%rd7];
mul.f64 %fd154, %fd146, %fd150;
mul.f64 %fd155, %fd154, %fd1;
sub.f64 %fd156, %fd142, %fd270;
mul.f64 %fd157, %fd271, %fd156;
sub.f64 %fd158, %fd155, %fd33;
mul.f64 %fd159, %fd34, %fd157;
sub.f64 %fd160, %fd158, %fd159;
mul.f64 %fd161, %fd141, %fd160;
mov.b64 %rd86, %fd161;
mul.f64 %fd162, %fd147, %fd151;
mul.f64 %fd163, %fd162, %fd1;
sub.f64 %fd164, %fd143, %fd270;
mul.f64 %fd165, %fd271, %fd164;
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
mad.lo.s32 %r207, %r316, %r4, %r11;
mad.lo.s32 %r208, %r207, %r110, %r15;
mul.wide.s32 %rd88, %r208, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
// end inline asm
$L__BB0_48:
add.s32 %r316, %r316, 1;
setp.lt.s32 %p41, %r316, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
mov.u32 %r209, %tid.z;
mad.lo.s32 %r210, %r4, %r209, %r8;
mad.lo.s32 %r28, %r210, %r3, %r6;
mul.wide.u32 %rd89, %r28, 8;
add.s64 %rd23, %rd43, %rd89;
clz.b32 %r211, %r4;
mov.u32 %r212, 31;
sub.s32 %r213, %r212, %r211;
mov.u32 %r214, 1;
shl.b32 %r29, %r214, %r213;
setp.lt.u32 %p42, %r8, %r29;
add.s32 %r215, %r29, %r8;
setp.lt.u32 %p43, %r215, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r216, %r3, %r213;
add.s32 %r217, %r28, %r216;
mul.wide.s32 %rd91, %r217, 8;
add.s64 %rd24, %rd43, %rd91;
shr.u32 %r218, %r29, 31;
add.s32 %r219, %r29, %r218;
shr.s32 %r322, %r219, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
ld.shared.f64 %fd170, [%rd24];
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
setp.lt.s32 %p45, %r29, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
mov.u32 %r319, %r322;
$L__BB0_53:
setp.ge.u32 %p46, %r8, %r319;
@%p46 bra $L__BB0_55;
mad.lo.s32 %r220, %r319, %r3, %r28;
mul.wide.s32 %rd92, %r220, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
shr.u32 %r32, %r319, 1;
setp.gt.u32 %p47, %r319, 3;
mov.u32 %r319, %r32;
@%p47 bra $L__BB0_53;
$L__BB0_56:
add.s32 %r221, %r28, %r3;
mul.wide.u32 %rd95, %r221, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f64 %fd177, [%rd23];
add.f64 %fd282, %fd177, 0d0000000000000000;
@%p49 bra $L__BB0_59;
ld.shared.f64 %fd178, [%rd25];
add.f64 %fd282, %fd282, %fd178;
$L__BB0_59:
bar.sync 0;
st.shared.f64 [%rd23], %fd273;
bar.sync 0;
@%p44 bra $L__BB0_61;
ld.shared.f64 %fd179, [%rd24];
ld.shared.f64 %fd180, [%rd23];
add.f64 %fd181, %fd179, %fd180;
st.shared.f64 [%rd23], %fd181;
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
mov.u32 %r320, %r322;
$L__BB0_63:
setp.ge.u32 %p52, %r8, %r320;
@%p52 bra $L__BB0_65;
mad.lo.s32 %r222, %r320, %r3, %r28;
mul.wide.s32 %rd97, %r222, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
shr.u32 %r34, %r320, 1;
setp.gt.u32 %p53, %r320, 3;
mov.u32 %r320, %r34;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f64 %fd186, [%rd23];
add.f64 %fd283, %fd186, 0d0000000000000000;
@%p55 bra $L__BB0_69;
ld.shared.f64 %fd187, [%rd25];
add.f64 %fd283, %fd283, %fd187;
$L__BB0_69:
bar.sync 0;
st.shared.f64 [%rd23], %fd280;
bar.sync 0;
@%p44 bra $L__BB0_71;
ld.shared.f64 %fd188, [%rd24];
ld.shared.f64 %fd189, [%rd23];
add.f64 %fd190, %fd188, %fd189;
st.shared.f64 [%rd23], %fd190;
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
mov.u32 %r321, %r322;
$L__BB0_73:
setp.ge.u32 %p58, %r8, %r321;
@%p58 bra $L__BB0_75;
mad.lo.s32 %r223, %r321, %r3, %r28;
mul.wide.s32 %rd100, %r223, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
shr.u32 %r36, %r321, 1;
setp.gt.u32 %p59, %r321, 3;
mov.u32 %r321, %r36;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f64 %fd195, [%rd23];
add.f64 %fd284, %fd195, 0d0000000000000000;
@%p61 bra $L__BB0_79;
ld.shared.f64 %fd196, [%rd25];
add.f64 %fd284, %fd284, %fd196;
$L__BB0_79:
bar.sync 0;
st.shared.f64 [%rd23], %fd281;
bar.sync 0;
@%p44 bra $L__BB0_81;
ld.shared.f64 %fd197, [%rd24];
ld.shared.f64 %fd198, [%rd23];
add.f64 %fd199, %fd197, %fd198;
st.shared.f64 [%rd23], %fd199;
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
setp.ge.u32 %p64, %r8, %r322;
@%p64 bra $L__BB0_84;
mad.lo.s32 %r224, %r322, %r3, %r28;
mul.wide.s32 %rd103, %r224, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
shr.u32 %r38, %r322, 1;
setp.gt.u32 %p65, %r322, 3;
mov.u32 %r322, %r38;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f64 %fd204, [%rd23];
add.f64 %fd285, %fd204, 0d0000000000000000;
@%p67 bra $L__BB0_88;
ld.shared.f64 %fd205, [%rd25];
add.f64 %fd285, %fd285, %fd205;
$L__BB0_88:
setp.eq.s32 %p116, %r8, 0;
and.pred %p115, %p116, %p1;
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
shl.b32 %r313, %r6, 1;
mov.u32 %r233, %ctaid.y;
mad.lo.s32 %r234, %r110, %r233, %r313;
mul.wide.s32 %rd108, %r234, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
mov.b64 {%r225, %r226}, %rd109;
mov.b64 %rd110, %fd283;
mov.b64 {%r227, %r228}, %rd110;
// begin inline asm
st.volatile.global.v4.s32 [%rd106], {%r225,%r226,%r227,%r228};
// end inline asm
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
mov.b64 {%r229, %r230}, %rd111;
mov.b64 %rd112, %fd285;
mov.b64 {%r231, %r232}, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd107], {%r229,%r230,%r231,%r232};
// end inline asm
$L__BB0_90:
mov.u32 %r39, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r235, %r6, %r8;
or.b32 %r237, %r235, %r209;
setp.ne.s32 %p68, %r237, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
mov.u32 %r238, %ctaid.x;
mov.u32 %r239, %ctaid.z;
mov.u32 %r240, %nctaid.x;
mad.lo.s32 %r241, %r239, %r240, %r238;
mul.wide.s32 %rd114, %r241, 8;
add.s64 %rd28, %rd113, %rd114;
add.s32 %r242, %r9, -1;
setp.eq.s32 %p69, %r39, %r242;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
mov.u32 %r323, 8;
$L__BB0_93:
// begin inline asm
nanosleep.u32 %r323;
// end inline asm
setp.lt.u32 %p71, %r323, 256;
selp.u32 %r245, 1, 0, %p71;
shl.b32 %r323, %r323, %r245;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
add.s32 %r246, %r9, %r3;
add.s32 %r247, %r246, -1;
div.s32 %r42, %r247, %r3;
setp.lt.s32 %p73, %r42, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
add.s32 %r249, %r4, %r2;
add.s32 %r250, %r249, -1;
shl.b32 %r251, %r8, 1;
shl.b32 %r252, %r4, 1;
mad.lo.s32 %r253, %r252, %r39, %r251;
or.b32 %r254, %r253, 1;
setp.ge.s32 %p74, %r254, %r110;
div.s32 %r255, %r250, %r4;
setp.ge.s32 %p75, %r39, %r255;
or.pred %p6, %p75, %p74;
mul.lo.s32 %r256, %r4, %r39;
shl.b32 %r257, %r256, 1;
mad.lo.s32 %r258, %r110, %r6, %r257;
add.s32 %r325, %r258, %r251;
mul.lo.s32 %r44, %r110, %r3;
mov.u32 %r326, 0;
mov.f64 %fd209, 0d0000000000000000;
mov.u32 %r324, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
setp.ge.s32 %p76, %r324, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
mul.wide.s32 %rd124, %r325, 8;
add.s64 %rd123, %rd41, %rd124;
// begin inline asm
ld.volatile.global.v4.s32 {%r259,%r260,%r261,%r262}, [%rd123];
// end inline asm
mov.b64 %rd125, {%r259, %r260};
mov.b64 %fd289, %rd125;
mov.b64 %rd126, {%r261, %r262};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
add.s32 %r325, %r325, %r44;
add.s32 %r324, %r324, %r3;
add.s32 %r326, %r326, 1;
setp.lt.s32 %p77, %r326, %r42;
@%p77 bra $L__BB0_96;
$L__BB0_100:
clz.b32 %r263, %r3;
mov.u32 %r264, 31;
sub.s32 %r265, %r264, %r263;
mov.u32 %r266, 1;
shl.b32 %r51, %r266, %r265;
setp.lt.u32 %p78, %r6, %r51;
add.s32 %r267, %r51, %r6;
setp.lt.u32 %p79, %r267, %r3;
and.pred %p7, %p78, %p79;
add.s32 %r268, %r28, %r51;
mul.wide.s32 %rd127, %r268, 8;
add.s64 %rd30, %rd43, %rd127;
shr.u32 %r269, %r51, 31;
add.s32 %r270, %r51, %r269;
shr.s32 %r333, %r270, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
ld.shared.f64 %fd214, [%rd30];
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
setp.lt.s32 %p81, %r51, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
mov.u32 %r327, %r333;
$L__BB0_104:
setp.ge.u32 %p82, %r6, %r327;
@%p82 bra $L__BB0_106;
add.s32 %r271, %r327, %r28;
mul.wide.s32 %rd129, %r271, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
shr.u32 %r54, %r327, 1;
setp.gt.u32 %p83, %r327, 3;
mov.u32 %r327, %r54;
@%p83 bra $L__BB0_104;
$L__BB0_107:
add.s32 %r272, %r28, 1;
mul.wide.u32 %rd132, %r272, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f64 %fd221, [%rd23];
add.f64 %fd292, %fd221, 0d0000000000000000;
@%p85 bra $L__BB0_110;
ld.shared.f64 %fd222, [%rd31];
add.f64 %fd292, %fd292, %fd222;
$L__BB0_110:
bar.sync 0;
st.shared.f64 [%rd23], %fd291;
bar.sync 0;
@%p80 bra $L__BB0_112;
ld.shared.f64 %fd223, [%rd30];
ld.shared.f64 %fd224, [%rd23];
add.f64 %fd225, %fd223, %fd224;
st.shared.f64 [%rd23], %fd225;
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
mov.u32 %r328, %r333;
$L__BB0_114:
setp.ge.u32 %p88, %r6, %r328;
@%p88 bra $L__BB0_116;
add.s32 %r273, %r328, %r28;
mul.wide.s32 %rd134, %r273, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
shr.u32 %r56, %r328, 1;
setp.gt.u32 %p89, %r328, 3;
mov.u32 %r328, %r56;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f64 %fd230, [%rd23];
add.f64 %fd293, %fd230, 0d0000000000000000;
@%p91 bra $L__BB0_120;
ld.shared.f64 %fd231, [%rd31];
add.f64 %fd293, %fd293, %fd231;
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
add.s32 %r274, %r4, %r2;
add.s32 %r275, %r274, -1;
div.s32 %r276, %r275, %r4;
setp.ge.s32 %p93, %r39, %r276;
@%p93 bra $L__BB0_124;
shl.b32 %r57, %r8, 1;
mul.lo.s32 %r277, %r4, %r39;
shl.b32 %r58, %r277, 1;
add.s32 %r278, %r57, %r58;
or.b32 %r279, %r278, 1;
setp.ge.s32 %p94, %r279, %r110;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r284, %r58, %r57;
mul.wide.s32 %rd138, %r284, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
mov.b64 {%r280, %r281}, %rd139;
mov.b64 %rd140, %fd293;
mov.b64 {%r282, %r283}, %rd140;
// begin inline asm
st.global.cs.v4.s32 [%rd137], {%r280,%r281,%r282,%r283};
// end inline asm
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
add.s32 %r286, %r4, %r2;
add.s32 %r287, %r286, -1;
shl.b32 %r288, %r8, 1;
shl.b32 %r289, %r4, 1;
mad.lo.s32 %r290, %r289, %r39, %r288;
or.b32 %r291, %r290, 1;
setp.ge.s32 %p96, %r291, %r110;
div.s32 %r292, %r287, %r4;
setp.ge.s32 %p97, %r39, %r292;
or.pred %p8, %p97, %p96;
mul.lo.s32 %r293, %r4, %r39;
shl.b32 %r294, %r293, 1;
mad.lo.s32 %r295, %r110, %r6, %r294;
add.s32 %r330, %r295, %r288;
mul.lo.s32 %r60, %r110, %r3;
mov.u32 %r331, 0;
mov.f64 %fd235, 0d0000000000000000;
mov.u32 %r329, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
setp.ge.s32 %p98, %r329, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
mul.wide.s32 %rd142, %r330, 8;
add.s64 %rd141, %rd40, %rd142;
// begin inline asm
ld.volatile.global.v4.s32 {%r296,%r297,%r298,%r299}, [%rd141];
// end inline asm
mov.b64 %rd143, {%r296, %r297};
mov.b64 %fd297, %rd143;
mov.b64 %rd144, {%r298, %r299};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
add.s32 %r330, %r330, %r60;
add.s32 %r329, %r329, %r3;
add.s32 %r331, %r331, 1;
setp.lt.s32 %p99, %r331, %r42;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@%p80 bra $L__BB0_132;
ld.shared.f64 %fd240, [%rd30];
ld.shared.f64 %fd241, [%rd23];
add.f64 %fd242, %fd240, %fd241;
st.shared.f64 [%rd23], %fd242;
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
mov.u32 %r332, %r333;
$L__BB0_134:
setp.ge.u32 %p102, %r6, %r332;
@%p102 bra $L__BB0_136;
add.s32 %r300, %r332, %r28;
mul.wide.s32 %rd145, %r300, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
shr.u32 %r68, %r332, 1;
setp.gt.u32 %p103, %r332, 3;
mov.u32 %r332, %r68;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
setp.lt.u32 %p105, %r3, 2;
ld.shared.f64 %fd247, [%rd23];
add.f64 %fd300, %fd247, 0d0000000000000000;
@%p105 bra $L__BB0_140;
ld.shared.f64 %fd248, [%rd31];
add.f64 %fd300, %fd300, %fd248;
$L__BB0_140:
bar.sync 0;
st.shared.f64 [%rd23], %fd299;
bar.sync 0;
@%p80 bra $L__BB0_142;
ld.shared.f64 %fd249, [%rd30];
ld.shared.f64 %fd250, [%rd23];
add.f64 %fd251, %fd249, %fd250;
st.shared.f64 [%rd23], %fd251;
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
setp.ge.u32 %p108, %r6, %r333;
@%p108 bra $L__BB0_145;
add.s32 %r301, %r333, %r28;
mul.wide.s32 %rd148, %r301, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
shr.u32 %r70, %r333, 1;
setp.gt.u32 %p109, %r333, 3;
mov.u32 %r333, %r70;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
setp.lt.u32 %p111, %r3, 2;
ld.shared.f64 %fd256, [%rd23];
add.f64 %fd301, %fd256, 0d0000000000000000;
@%p111 bra $L__BB0_149;
ld.shared.f64 %fd257, [%rd31];
add.f64 %fd301, %fd301, %fd257;
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
add.s32 %r302, %r4, %r2;
add.s32 %r303, %r302, -1;
div.s32 %r304, %r303, %r4;
setp.ge.s32 %p113, %r39, %r304;
@%p113 bra $L__BB0_153;
shl.b32 %r71, %r8, 1;
mul.lo.s32 %r305, %r4, %r39;
shl.b32 %r72, %r305, 1;
add.s32 %r306, %r71, %r72;
or.b32 %r307, %r306, 1;
setp.ge.s32 %p114, %r307, %r110;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_e9b9611f_103399nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r312, %r72, %r71;
mul.wide.s32 %rd152, %r312, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
mov.b64 {%r308, %r309}, %rd153;
mov.b64 %rd154, %fd301;
mov.b64 {%r310, %r311}, %rd154;
// begin inline asm
st.global.cs.v4.s32 [%rd151], {%r308,%r309,%r310,%r311};
// end inline asm
$L__BB0_153:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72335arrayE[];
.entry _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
.reg .b32 %r<332>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
ld.param.v2.u32 {%r108, %r109}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r118, %r119}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r122, %r123}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r144, %r109, 1;
shr.u32 %r145, %r144, 31;
add.s32 %r146, %r144, %r145;
shr.s32 %r2, %r146, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r147, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r148, %r4, 3;
mad.lo.s32 %r149, %r148, %r147, 15;
and.b32 %r150, %r149, -16;
cvt.u64.u32 %rd1, %r150;
mul.lo.s32 %r151, %r4, %r2;
shl.b32 %r152, %r151, 4;
or.b32 %r153, %r152, 15;
and.b32 %r5, %r153, -16;
add.s32 %r154, %r153, %r5;
and.b32 %r155, %r154, -16;
cvt.s64.s32 %rd2, %r155;
mov.u64 %rd43, _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
cvt.rn.f64.s32 %fd1, %r109;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r156, %r7, 1;
setp.lt.s32 %p10, %r156, %r109;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r157, smem_ptr; }
// end inline asm
shl.b32 %r160, %r6, 4;
add.s32 %r158, %r157, %r160;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r159, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r159, 0;
cp.async.ca.shared.global [%r158], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r161, %r4, 215;
div.s32 %r162, %r161, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r163, %r9, %r162;
add.s32 %r164, %r163, -1;
div.s32 %r10, %r164, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r166, %ctaid.y;
mul.lo.s32 %r167, %r10, %r4;
mul.lo.s32 %r11, %r167, %r166;
mad.lo.s32 %r168, %r2, %r8, %r6;
shl.b32 %r12, %r168, 4;
mul.lo.s32 %r169, %r109, %r8;
cvt.s64.s32 %rd52, %r169;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r170, %r11, %r109;
cvt.s64.s32 %rd6, %r170;
mul.lo.s32 %r13, %r109, %r4;
mul.lo.s32 %r14, %r10, %r166;
shl.b32 %r171, %r8, 1;
mov.u32 %r172, 1;
mad.lo.s32 %r173, %r171, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r173, 8;
add.s64 %rd7, %rd54, %rd55;
mov.u32 %r174, %tid.z;
mad.lo.s32 %r175, %r4, %r174, %r8;
mad.lo.s32 %r15, %r175, %r3, %r6;
mul.wide.u32 %rd56, %r15, 8;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r176, %r3;
mov.u32 %r177, 31;
sub.s32 %r178, %r177, %r176;
shl.b32 %r16, %r172, %r178;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r179, %r16, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r180, %r15, %r16;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r181, %r16, 31;
add.s32 %r182, %r16, %r181;
shr.s32 %r17, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r183, %r15, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r175, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r314, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
// end inline asm
add.s32 %r187, %r186, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
// end inline asm
add.s32 %r197, %r196, %r12;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r184, %r314, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r189, %r13, %r314;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r188, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r188, 0;
cp.async.ca.shared.global [%r187], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r190, %r14, %r314;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd268, 0d0000000000000000;
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r192, %r14, %r314;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
add.s32 %r194, %r14, %r314;
mad.lo.s32 %r21, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
setp.gt.s32 %p21, %r21, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
mul.lo.s32 %r195, %r21, %r118;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
setp.lt.s32 %p22, %r21, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
mul.lo.s32 %r199, %r13, %r314;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r198, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r198, 0;
cp.async.ca.shared.global [%r197], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
add.s32 %r313, %r14, %r314;
mad.lo.s32 %r312, %r313, %r4, %r8;
setp.gt.s32 %p117, %r312, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
mul.lo.s32 %r200, %r21, %r122;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd275, %fd274;
@%p23 bra $L__BB0_22;
ld.shared.v2.f64 {%fd103, %fd104}, [%rd7];
ld.shared.v2.f64 {%fd107, %fd108}, [%rd10];
ld.shared.v2.f64 {%fd111, %fd112}, [%rd12];
mul.f64 %fd115, %fd111, %fd103;
add.f64 %fd116, %fd115, 0d0000000000000000;
sub.f64 %fd117, %fd107, %fd270;
mul.f64 %fd118, %fd271, %fd117;
fma.rn.f64 %fd119, %fd115, %fd118, 0d0000000000000000;
fma.rn.f64 %fd272, %fd118, %fd103, %fd272;
mul.f64 %fd120, %fd112, %fd104;
add.f64 %fd275, %fd116, %fd120;
sub.f64 %fd121, %fd108, %fd270;
mul.f64 %fd122, %fd271, %fd121;
fma.rn.f64 %fd274, %fd120, %fd122, %fd119;
fma.rn.f64 %fd273, %fd122, %fd104, %fd273;
$L__BB0_22:
st.shared.f64 [%rd8], %fd275;
bar.sync 0;
@%p26 bra $L__BB0_24;
ld.shared.f64 %fd123, [%rd9];
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
mov.u32 %r315, %r17;
$L__BB0_26:
setp.ge.u32 %p28, %r6, %r315;
@%p28 bra $L__BB0_28;
add.s32 %r201, %r315, %r15;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
shr.u32 %r23, %r315, 1;
setp.gt.u32 %p29, %r315, 3;
mov.u32 %r315, %r23;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@%p30 bra $L__BB0_32;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f64 %fd130, [%rd8];
add.f64 %fd276, %fd130, 0d0000000000000000;
@%p31 bra $L__BB0_32;
ld.shared.f64 %fd131, [%rd11];
add.f64 %fd276, %fd276, %fd131;
$L__BB0_32:
bar.sync 0;
st.shared.f64 [%rd8], %fd274;
bar.sync 0;
@%p26 bra $L__BB0_34;
ld.shared.f64 %fd132, [%rd9];
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
setp.lt.s32 %p118, %r16, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
mov.u32 %r316, %r17;
$L__BB0_36:
setp.ge.u32 %p34, %r6, %r316;
@%p34 bra $L__BB0_38;
add.s32 %r202, %r316, %r15;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
shr.u32 %r25, %r316, 1;
setp.gt.u32 %p35, %r316, 3;
mov.u32 %r316, %r25;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f64 %fd139, [%rd8];
add.f64 %fd277, %fd139, 0d0000000000000000;
@%p37 bra $L__BB0_42;
ld.shared.f64 %fd140, [%rd11];
add.f64 %fd277, %fd277, %fd140;
$L__BB0_42:
bar.sync 0;
@%p30 bra $L__BB0_44;
st.shared.f64 [%rd13], %fd276;
$L__BB0_44:
bar.sync 0;
ld.shared.f64 %fd33, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_46;
st.shared.f64 [%rd13], %fd277;
$L__BB0_46:
bar.sync 0;
ld.shared.f64 %fd34, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_48;
mul.f64 %fd141, %fd2, %fd271;
ld.shared.v2.f64 {%fd142, %fd143}, [%rd10];
ld.shared.v2.f64 {%fd146, %fd147}, [%rd12];
ld.shared.v2.f64 {%fd150, %fd151}, [%rd7];
mul.f64 %fd154, %fd146, %fd150;
mul.f64 %fd155, %fd154, %fd1;
sub.f64 %fd156, %fd142, %fd270;
mul.f64 %fd157, %fd271, %fd156;
sub.f64 %fd158, %fd155, %fd33;
mul.f64 %fd159, %fd34, %fd157;
sub.f64 %fd160, %fd158, %fd159;
mul.f64 %fd161, %fd141, %fd160;
mov.b64 %rd86, %fd161;
mul.f64 %fd162, %fd147, %fd151;
mul.f64 %fd163, %fd162, %fd1;
sub.f64 %fd164, %fd143, %fd270;
mul.f64 %fd165, %fd271, %fd164;
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
mad.lo.s32 %r207, %r21, %r109, %r7;
mul.wide.s32 %rd88, %r207, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
// end inline asm
$L__BB0_48:
add.s32 %r314, %r314, 1;
setp.lt.s32 %p41, %r314, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
mov.u32 %r208, %tid.z;
mad.lo.s32 %r209, %r4, %r208, %r8;
mad.lo.s32 %r27, %r209, %r3, %r6;
mul.wide.u32 %rd89, %r27, 8;
add.s64 %rd23, %rd43, %rd89;
clz.b32 %r210, %r4;
mov.u32 %r211, 31;
sub.s32 %r212, %r211, %r210;
mov.u32 %r213, 1;
shl.b32 %r28, %r213, %r212;
setp.lt.u32 %p42, %r8, %r28;
add.s32 %r214, %r28, %r8;
setp.lt.u32 %p43, %r214, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r215, %r3, %r212;
add.s32 %r216, %r27, %r215;
mul.wide.s32 %rd91, %r216, 8;
add.s64 %rd24, %rd43, %rd91;
shr.u32 %r217, %r28, 31;
add.s32 %r218, %r28, %r217;
shr.s32 %r320, %r218, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
ld.shared.f64 %fd170, [%rd24];
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
setp.lt.s32 %p45, %r28, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
mov.u32 %r317, %r320;
$L__BB0_53:
setp.ge.u32 %p46, %r8, %r317;
@%p46 bra $L__BB0_55;
mad.lo.s32 %r219, %r317, %r3, %r27;
mul.wide.s32 %rd92, %r219, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
shr.u32 %r31, %r317, 1;
setp.gt.u32 %p47, %r317, 3;
mov.u32 %r317, %r31;
@%p47 bra $L__BB0_53;
$L__BB0_56:
add.s32 %r220, %r27, %r3;
mul.wide.u32 %rd95, %r220, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f64 %fd177, [%rd23];
add.f64 %fd282, %fd177, 0d0000000000000000;
@%p49 bra $L__BB0_59;
ld.shared.f64 %fd178, [%rd25];
add.f64 %fd282, %fd282, %fd178;
$L__BB0_59:
bar.sync 0;
st.shared.f64 [%rd23], %fd273;
bar.sync 0;
@%p44 bra $L__BB0_61;
ld.shared.f64 %fd179, [%rd24];
ld.shared.f64 %fd180, [%rd23];
add.f64 %fd181, %fd179, %fd180;
st.shared.f64 [%rd23], %fd181;
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
mov.u32 %r318, %r320;
$L__BB0_63:
setp.ge.u32 %p52, %r8, %r318;
@%p52 bra $L__BB0_65;
mad.lo.s32 %r221, %r318, %r3, %r27;
mul.wide.s32 %rd97, %r221, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
shr.u32 %r33, %r318, 1;
setp.gt.u32 %p53, %r318, 3;
mov.u32 %r318, %r33;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f64 %fd186, [%rd23];
add.f64 %fd283, %fd186, 0d0000000000000000;
@%p55 bra $L__BB0_69;
ld.shared.f64 %fd187, [%rd25];
add.f64 %fd283, %fd283, %fd187;
$L__BB0_69:
bar.sync 0;
st.shared.f64 [%rd23], %fd280;
bar.sync 0;
@%p44 bra $L__BB0_71;
ld.shared.f64 %fd188, [%rd24];
ld.shared.f64 %fd189, [%rd23];
add.f64 %fd190, %fd188, %fd189;
st.shared.f64 [%rd23], %fd190;
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
mov.u32 %r319, %r320;
$L__BB0_73:
setp.ge.u32 %p58, %r8, %r319;
@%p58 bra $L__BB0_75;
mad.lo.s32 %r222, %r319, %r3, %r27;
mul.wide.s32 %rd100, %r222, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
shr.u32 %r35, %r319, 1;
setp.gt.u32 %p59, %r319, 3;
mov.u32 %r319, %r35;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f64 %fd195, [%rd23];
add.f64 %fd284, %fd195, 0d0000000000000000;
@%p61 bra $L__BB0_79;
ld.shared.f64 %fd196, [%rd25];
add.f64 %fd284, %fd284, %fd196;
$L__BB0_79:
bar.sync 0;
st.shared.f64 [%rd23], %fd281;
bar.sync 0;
@%p44 bra $L__BB0_81;
ld.shared.f64 %fd197, [%rd24];
ld.shared.f64 %fd198, [%rd23];
add.f64 %fd199, %fd197, %fd198;
st.shared.f64 [%rd23], %fd199;
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
setp.ge.u32 %p64, %r8, %r320;
@%p64 bra $L__BB0_84;
mad.lo.s32 %r223, %r320, %r3, %r27;
mul.wide.s32 %rd103, %r223, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
shr.u32 %r37, %r320, 1;
setp.gt.u32 %p65, %r320, 3;
mov.u32 %r320, %r37;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f64 %fd204, [%rd23];
add.f64 %fd285, %fd204, 0d0000000000000000;
@%p67 bra $L__BB0_88;
ld.shared.f64 %fd205, [%rd25];
add.f64 %fd285, %fd285, %fd205;
$L__BB0_88:
setp.eq.s32 %p116, %r8, 0;
and.pred %p115, %p116, %p1;
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
mov.u32 %r232, %ctaid.y;
mad.lo.s32 %r233, %r109, %r232, %r7;
mul.wide.s32 %rd108, %r233, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
mov.b64 {%r224, %r225}, %rd109;
mov.b64 %rd110, %fd283;
mov.b64 {%r226, %r227}, %rd110;
// begin inline asm
st.volatile.global.v4.s32 [%rd106], {%r224,%r225,%r226,%r227};
// end inline asm
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
mov.b64 {%r228, %r229}, %rd111;
mov.b64 %rd112, %fd285;
mov.b64 {%r230, %r231}, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd107], {%r228,%r229,%r230,%r231};
// end inline asm
$L__BB0_90:
mov.u32 %r38, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r234, %r6, %r8;
or.b32 %r236, %r234, %r208;
setp.ne.s32 %p68, %r236, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
mov.u32 %r237, %ctaid.x;
mov.u32 %r238, %ctaid.z;
mov.u32 %r239, %nctaid.x;
mad.lo.s32 %r240, %r238, %r239, %r237;
mul.wide.s32 %rd114, %r240, 8;
add.s64 %rd28, %rd113, %rd114;
add.s32 %r241, %r9, -1;
setp.eq.s32 %p69, %r38, %r241;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
mov.u32 %r321, 8;
$L__BB0_93:
// begin inline asm
nanosleep.u32 %r321;
// end inline asm
setp.lt.u32 %p71, %r321, 256;
selp.u32 %r244, 1, 0, %p71;
shl.b32 %r321, %r321, %r244;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
add.s32 %r245, %r9, %r3;
add.s32 %r246, %r245, -1;
div.s32 %r41, %r246, %r3;
setp.lt.s32 %p73, %r41, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
add.s32 %r248, %r4, %r2;
add.s32 %r249, %r248, -1;
shl.b32 %r250, %r8, 1;
shl.b32 %r251, %r4, 1;
mad.lo.s32 %r252, %r251, %r38, %r250;
or.b32 %r253, %r252, 1;
setp.ge.s32 %p74, %r253, %r109;
div.s32 %r254, %r249, %r4;
setp.ge.s32 %p75, %r38, %r254;
or.pred %p6, %p75, %p74;
mul.lo.s32 %r255, %r4, %r38;
shl.b32 %r256, %r255, 1;
mad.lo.s32 %r257, %r109, %r6, %r256;
add.s32 %r323, %r257, %r250;
mul.lo.s32 %r43, %r109, %r3;
mov.u32 %r324, 0;
mov.f64 %fd209, 0d0000000000000000;
mov.u32 %r322, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
setp.ge.s32 %p76, %r322, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
mul.wide.s32 %rd124, %r323, 8;
add.s64 %rd123, %rd41, %rd124;
// begin inline asm
ld.volatile.global.v4.s32 {%r258,%r259,%r260,%r261}, [%rd123];
// end inline asm
mov.b64 %rd125, {%r258, %r259};
mov.b64 %fd289, %rd125;
mov.b64 %rd126, {%r260, %r261};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
add.s32 %r323, %r323, %r43;
add.s32 %r322, %r322, %r3;
add.s32 %r324, %r324, 1;
setp.lt.s32 %p77, %r324, %r41;
@%p77 bra $L__BB0_96;
$L__BB0_100:
clz.b32 %r262, %r3;
mov.u32 %r263, 31;
sub.s32 %r264, %r263, %r262;
mov.u32 %r265, 1;
shl.b32 %r50, %r265, %r264;
setp.lt.u32 %p78, %r6, %r50;
add.s32 %r266, %r50, %r6;
setp.lt.u32 %p79, %r266, %r3;
and.pred %p7, %p78, %p79;
add.s32 %r267, %r27, %r50;
mul.wide.s32 %rd127, %r267, 8;
add.s64 %rd30, %rd43, %rd127;
shr.u32 %r268, %r50, 31;
add.s32 %r269, %r50, %r268;
shr.s32 %r331, %r269, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
ld.shared.f64 %fd214, [%rd30];
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
setp.lt.s32 %p81, %r50, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
mov.u32 %r325, %r331;
$L__BB0_104:
setp.ge.u32 %p82, %r6, %r325;
@%p82 bra $L__BB0_106;
add.s32 %r270, %r325, %r27;
mul.wide.s32 %rd129, %r270, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
shr.u32 %r53, %r325, 1;
setp.gt.u32 %p83, %r325, 3;
mov.u32 %r325, %r53;
@%p83 bra $L__BB0_104;
$L__BB0_107:
add.s32 %r271, %r27, 1;
mul.wide.u32 %rd132, %r271, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f64 %fd221, [%rd23];
add.f64 %fd292, %fd221, 0d0000000000000000;
@%p85 bra $L__BB0_110;
ld.shared.f64 %fd222, [%rd31];
add.f64 %fd292, %fd292, %fd222;
$L__BB0_110:
bar.sync 0;
st.shared.f64 [%rd23], %fd291;
bar.sync 0;
@%p80 bra $L__BB0_112;
ld.shared.f64 %fd223, [%rd30];
ld.shared.f64 %fd224, [%rd23];
add.f64 %fd225, %fd223, %fd224;
st.shared.f64 [%rd23], %fd225;
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
mov.u32 %r326, %r331;
$L__BB0_114:
setp.ge.u32 %p88, %r6, %r326;
@%p88 bra $L__BB0_116;
add.s32 %r272, %r326, %r27;
mul.wide.s32 %rd134, %r272, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
shr.u32 %r55, %r326, 1;
setp.gt.u32 %p89, %r326, 3;
mov.u32 %r326, %r55;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f64 %fd230, [%rd23];
add.f64 %fd293, %fd230, 0d0000000000000000;
@%p91 bra $L__BB0_120;
ld.shared.f64 %fd231, [%rd31];
add.f64 %fd293, %fd293, %fd231;
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
add.s32 %r273, %r4, %r2;
add.s32 %r274, %r273, -1;
div.s32 %r275, %r274, %r4;
setp.ge.s32 %p93, %r38, %r275;
@%p93 bra $L__BB0_124;
shl.b32 %r56, %r8, 1;
mul.lo.s32 %r276, %r4, %r38;
shl.b32 %r57, %r276, 1;
add.s32 %r277, %r56, %r57;
or.b32 %r278, %r277, 1;
setp.ge.s32 %p94, %r278, %r109;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r283, %r57, %r56;
mul.wide.s32 %rd138, %r283, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
mov.b64 {%r279, %r280}, %rd139;
mov.b64 %rd140, %fd293;
mov.b64 {%r281, %r282}, %rd140;
// begin inline asm
st.global.cs.v4.s32 [%rd137], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
add.s32 %r285, %r4, %r2;
add.s32 %r286, %r285, -1;
shl.b32 %r287, %r8, 1;
shl.b32 %r288, %r4, 1;
mad.lo.s32 %r289, %r288, %r38, %r287;
or.b32 %r290, %r289, 1;
setp.ge.s32 %p96, %r290, %r109;
div.s32 %r291, %r286, %r4;
setp.ge.s32 %p97, %r38, %r291;
or.pred %p8, %p97, %p96;
mul.lo.s32 %r292, %r4, %r38;
shl.b32 %r293, %r292, 1;
mad.lo.s32 %r294, %r109, %r6, %r293;
add.s32 %r328, %r294, %r287;
mul.lo.s32 %r59, %r109, %r3;
mov.u32 %r329, 0;
mov.f64 %fd235, 0d0000000000000000;
mov.u32 %r327, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
setp.ge.s32 %p98, %r327, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
mul.wide.s32 %rd142, %r328, 8;
add.s64 %rd141, %rd40, %rd142;
// begin inline asm
ld.volatile.global.v4.s32 {%r295,%r296,%r297,%r298}, [%rd141];
// end inline asm
mov.b64 %rd143, {%r295, %r296};
mov.b64 %fd297, %rd143;
mov.b64 %rd144, {%r297, %r298};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
add.s32 %r328, %r328, %r59;
add.s32 %r327, %r327, %r3;
add.s32 %r329, %r329, 1;
setp.lt.s32 %p99, %r329, %r41;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@%p80 bra $L__BB0_132;
ld.shared.f64 %fd240, [%rd30];
ld.shared.f64 %fd241, [%rd23];
add.f64 %fd242, %fd240, %fd241;
st.shared.f64 [%rd23], %fd242;
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
mov.u32 %r330, %r331;
$L__BB0_134:
setp.ge.u32 %p102, %r6, %r330;
@%p102 bra $L__BB0_136;
add.s32 %r299, %r330, %r27;
mul.wide.s32 %rd145, %r299, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
shr.u32 %r67, %r330, 1;
setp.gt.u32 %p103, %r330, 3;
mov.u32 %r330, %r67;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
setp.lt.u32 %p105, %r3, 2;
ld.shared.f64 %fd247, [%rd23];
add.f64 %fd300, %fd247, 0d0000000000000000;
@%p105 bra $L__BB0_140;
ld.shared.f64 %fd248, [%rd31];
add.f64 %fd300, %fd300, %fd248;
$L__BB0_140:
bar.sync 0;
st.shared.f64 [%rd23], %fd299;
bar.sync 0;
@%p80 bra $L__BB0_142;
ld.shared.f64 %fd249, [%rd30];
ld.shared.f64 %fd250, [%rd23];
add.f64 %fd251, %fd249, %fd250;
st.shared.f64 [%rd23], %fd251;
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
setp.ge.u32 %p108, %r6, %r331;
@%p108 bra $L__BB0_145;
add.s32 %r300, %r331, %r27;
mul.wide.s32 %rd148, %r300, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
shr.u32 %r69, %r331, 1;
setp.gt.u32 %p109, %r331, 3;
mov.u32 %r331, %r69;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
setp.lt.u32 %p111, %r3, 2;
ld.shared.f64 %fd256, [%rd23];
add.f64 %fd301, %fd256, 0d0000000000000000;
@%p111 bra $L__BB0_149;
ld.shared.f64 %fd257, [%rd31];
add.f64 %fd301, %fd301, %fd257;
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
add.s32 %r301, %r4, %r2;
add.s32 %r302, %r301, -1;
div.s32 %r303, %r302, %r4;
setp.ge.s32 %p113, %r38, %r303;
@%p113 bra $L__BB0_153;
shl.b32 %r70, %r8, 1;
mul.lo.s32 %r304, %r4, %r38;
shl.b32 %r71, %r304, 1;
add.s32 %r305, %r70, %r71;
or.b32 %r306, %r305, 1;
setp.ge.s32 %p114, %r306, %r109;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_2_cu_0b55d258_72339nvfuser_2ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r311, %r71, %r70;
mul.wide.s32 %rd152, %r311, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
mov.b64 {%r307, %r308}, %rd153;
mov.b64 %rd154, %fd301;
mov.b64 {%r309, %r310}, %rd154;
// begin inline asm
st.global.cs.v4.s32 [%rd151], {%r307,%r308,%r309,%r310};
// end inline asm
$L__BB0_153:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -29,175 +29,175 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
- .reg .b32 %r<334>;
+ .reg .b32 %r<332>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
- ld.param.v2.u32 {%r109, %r110}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r119, %r120}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r123, %r124}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r108, %r109}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r118, %r119}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r122, %r123}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r145, %r110, 1;
- shr.u32 %r146, %r145, 31;
- add.s32 %r147, %r145, %r146;
- shr.s32 %r2, %r147, 1;
+ add.s32 %r144, %r109, 1;
+ shr.u32 %r145, %r144, 31;
+ add.s32 %r146, %r144, %r145;
+ shr.s32 %r2, %r146, 1;
mov.u32 %r3, %ntid.x;
- max.s32 %r148, %r2, %r3;
+ max.s32 %r147, %r2, %r3;
mov.u32 %r4, %ntid.y;
- shl.b32 %r149, %r4, 3;
- mad.lo.s32 %r150, %r149, %r148, 15;
- and.b32 %r151, %r150, -16;
- cvt.u64.u32 %rd1, %r151;
- mul.lo.s32 %r152, %r4, %r2;
- shl.b32 %r153, %r152, 4;
- or.b32 %r154, %r153, 15;
- and.b32 %r5, %r154, -16;
- add.s32 %r155, %r154, %r5;
- and.b32 %r156, %r155, -16;
- cvt.s64.s32 %rd2, %r156;
+ shl.b32 %r148, %r4, 3;
+ mad.lo.s32 %r149, %r148, %r147, 15;
+ and.b32 %r150, %r149, -16;
+ cvt.u64.u32 %rd1, %r150;
+ mul.lo.s32 %r151, %r4, %r2;
+ shl.b32 %r152, %r151, 4;
+ or.b32 %r153, %r152, 15;
+ and.b32 %r5, %r153, -16;
+ add.s32 %r154, %r153, %r5;
+ and.b32 %r155, %r154, -16;
+ cvt.s64.s32 %rd2, %r155;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
- cvt.rn.f64.s32 %fd1, %r110;
+ cvt.rn.f64.s32 %fd1, %r109;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
- or.b32 %r157, %r7, 1;
- setp.lt.s32 %p10, %r157, %r110;
+ or.b32 %r156, %r7, 1;
+ setp.lt.s32 %p10, %r156, %r109;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r158, smem_ptr; }
-
-
- shl.b32 %r161, %r6, 4;
- add.s32 %r159, %r158, %r161;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r157, smem_ptr; }
+
+
+ shl.b32 %r160, %r6, 4;
+ add.s32 %r158, %r157, %r160;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
- mov.u32 %r160, 0;
+ mov.u32 %r159, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r160, 0;
- cp.async.ca.shared.global [%r159], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r159, 0;
+ cp.async.ca.shared.global [%r158], [%rd46], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r162, %r4, 215;
- div.s32 %r163, %r162, %r4;
+ add.s32 %r161, %r4, 215;
+ div.s32 %r162, %r161, %r4;
mov.u32 %r9, %nctaid.y;
- add.s32 %r164, %r9, %r163;
- add.s32 %r165, %r164, -1;
- div.s32 %r10, %r165, %r9;
+ add.s32 %r163, %r9, %r162;
+ add.s32 %r164, %r163, -1;
+ div.s32 %r10, %r164, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
- mov.u32 %r167, %ctaid.y;
- mul.lo.s32 %r168, %r10, %r4;
- mul.lo.s32 %r11, %r168, %r167;
- shl.b32 %r169, %r8, 3;
- shl.b32 %r170, %r6, 4;
- mad.lo.s32 %r12, %r169, %r110, %r170;
- mul.lo.s32 %r171, %r110, %r8;
- cvt.s64.s32 %rd52, %r171;
+ mov.u32 %r166, %ctaid.y;
+ mul.lo.s32 %r167, %r10, %r4;
+ mul.lo.s32 %r11, %r167, %r166;
+ mad.lo.s32 %r168, %r2, %r8, %r6;
+ shl.b32 %r12, %r168, 4;
+ mul.lo.s32 %r169, %r109, %r8;
+ cvt.s64.s32 %rd52, %r169;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r172, %r11, %r110;
- cvt.s64.s32 %rd6, %r172;
- mul.lo.s32 %r13, %r110, %r4;
- mul.lo.s32 %r14, %r10, %r167;
- add.s32 %r15, %r171, %r7;
+ mul.lo.s32 %r170, %r11, %r109;
+ cvt.s64.s32 %rd6, %r170;
+ mul.lo.s32 %r13, %r109, %r4;
+ mul.lo.s32 %r14, %r10, %r166;
+ shl.b32 %r171, %r8, 1;
+ mov.u32 %r172, 1;
+ mad.lo.s32 %r173, %r171, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
- mul.wide.s32 %rd55, %r15, 8;
+ mul.wide.s32 %rd55, %r173, 8;
add.s64 %rd7, %rd54, %rd55;
- mov.u32 %r173, %tid.z;
- mad.lo.s32 %r174, %r4, %r173, %r8;
- mad.lo.s32 %r16, %r174, %r3, %r6;
- mul.wide.u32 %rd56, %r16, 8;
+ mov.u32 %r174, %tid.z;
+ mad.lo.s32 %r175, %r4, %r174, %r8;
+ mad.lo.s32 %r15, %r175, %r3, %r6;
+ mul.wide.u32 %rd56, %r15, 8;
add.s64 %rd8, %rd43, %rd56;
- clz.b32 %r175, %r3;
- mov.u32 %r176, 31;
- sub.s32 %r177, %r176, %r175;
- mov.u32 %r178, 1;
- shl.b32 %r17, %r178, %r177;
- setp.lt.u32 %p14, %r6, %r17;
- add.s32 %r179, %r17, %r6;
+ clz.b32 %r176, %r3;
+ mov.u32 %r177, 31;
+ sub.s32 %r178, %r177, %r176;
+ shl.b32 %r16, %r172, %r178;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r179, %r16, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
- add.s32 %r180, %r16, %r17;
+ add.s32 %r180, %r15, %r16;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
- shr.u32 %r181, %r17, 31;
- add.s32 %r182, %r17, %r181;
- shr.s32 %r18, %r182, 1;
+ shr.u32 %r181, %r16, 31;
+ add.s32 %r182, %r16, %r181;
+ shr.s32 %r17, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
- add.s32 %r183, %r16, 1;
+ add.s32 %r183, %r15, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
- mul.wide.s32 %rd61, %r174, 8;
+ mul.wide.s32 %rd61, %r175, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
- mov.u32 %r316, 0;
+ mov.u32 %r314, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
- add.s32 %r187, %r12, %r186;
+ add.s32 %r187, %r186, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
- add.s32 %r197, %r12, %r196;
+ add.s32 %r197, %r196, %r12;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r184, %r316, %r4, %r8;
+ mad.lo.s32 %r184, %r314, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r189, %r13, %r316;
+ mul.lo.s32 %r189, %r13, %r314;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
@@ -216,11 +216,11 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r190, %r14, %r316;
+ add.s32 %r190, %r14, %r314;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
@@ -233,38 +233,38 @@
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
- add.s32 %r192, %r14, %r316;
+ add.s32 %r192, %r14, %r314;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
- add.s32 %r194, %r14, %r316;
- mad.lo.s32 %r22, %r194, %r4, %r8;
+ add.s32 %r194, %r14, %r314;
+ mad.lo.s32 %r21, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
- setp.gt.s32 %p21, %r22, 215;
+ setp.gt.s32 %p21, %r21, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
- mul.lo.s32 %r195, %r22, %r119;
+ mul.lo.s32 %r195, %r21, %r118;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
- setp.lt.s32 %p22, %r22, 216;
+ setp.lt.s32 %p22, %r21, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
- mul.lo.s32 %r199, %r13, %r316;
+ mul.lo.s32 %r199, %r13, %r314;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
@@ -277,18 +277,18 @@
}
$L__BB0_18:
- add.s32 %r315, %r14, %r316;
- mad.lo.s32 %r314, %r315, %r4, %r8;
- setp.gt.s32 %p117, %r314, 215;
+ add.s32 %r313, %r14, %r314;
+ mad.lo.s32 %r312, %r313, %r4, %r8;
+ setp.gt.s32 %p117, %r312, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
- mul.lo.s32 %r200, %r22, %r123;
+ mul.lo.s32 %r200, %r21, %r122;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
@@ -324,33 +324,33 @@
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
- setp.lt.s32 %p27, %r17, 4;
+ setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
- mov.u32 %r317, %r18;
+ mov.u32 %r315, %r17;
$L__BB0_26:
- setp.ge.u32 %p28, %r6, %r317;
+ setp.ge.u32 %p28, %r6, %r315;
@%p28 bra $L__BB0_28;
- add.s32 %r201, %r317, %r16;
+ add.s32 %r201, %r315, %r15;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
- shr.u32 %r24, %r317, 1;
- setp.gt.u32 %p29, %r317, 3;
- mov.u32 %r317, %r24;
+ shr.u32 %r23, %r315, 1;
+ setp.gt.u32 %p29, %r315, 3;
+ mov.u32 %r315, %r23;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@@ -374,33 +374,33 @@
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
- setp.lt.s32 %p118, %r17, 4;
+ setp.lt.s32 %p118, %r16, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
- mov.u32 %r318, %r18;
+ mov.u32 %r316, %r17;
$L__BB0_36:
- setp.ge.u32 %p34, %r6, %r318;
+ setp.ge.u32 %p34, %r6, %r316;
@%p34 bra $L__BB0_38;
- add.s32 %r202, %r318, %r16;
+ add.s32 %r202, %r316, %r15;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
- shr.u32 %r26, %r318, 1;
- setp.gt.u32 %p35, %r318, 3;
- mov.u32 %r318, %r26;
+ shr.u32 %r25, %r316, 1;
+ setp.gt.u32 %p35, %r316, 3;
+ mov.u32 %r316, %r25;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
@@ -453,54 +453,53 @@
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
- mad.lo.s32 %r207, %r316, %r4, %r11;
- mad.lo.s32 %r208, %r207, %r110, %r15;
- mul.wide.s32 %rd88, %r208, 8;
+ mad.lo.s32 %r207, %r21, %r109, %r7;
+ mul.wide.s32 %rd88, %r207, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
$L__BB0_48:
- add.s32 %r316, %r316, 1;
- setp.lt.s32 %p41, %r316, %r10;
+ add.s32 %r314, %r314, 1;
+ setp.lt.s32 %p41, %r314, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
- mov.u32 %r209, %tid.z;
- mad.lo.s32 %r210, %r4, %r209, %r8;
- mad.lo.s32 %r28, %r210, %r3, %r6;
- mul.wide.u32 %rd89, %r28, 8;
+ mov.u32 %r208, %tid.z;
+ mad.lo.s32 %r209, %r4, %r208, %r8;
+ mad.lo.s32 %r27, %r209, %r3, %r6;
+ mul.wide.u32 %rd89, %r27, 8;
add.s64 %rd23, %rd43, %rd89;
- clz.b32 %r211, %r4;
- mov.u32 %r212, 31;
- sub.s32 %r213, %r212, %r211;
- mov.u32 %r214, 1;
- shl.b32 %r29, %r214, %r213;
- setp.lt.u32 %p42, %r8, %r29;
- add.s32 %r215, %r29, %r8;
- setp.lt.u32 %p43, %r215, %r4;
+ clz.b32 %r210, %r4;
+ mov.u32 %r211, 31;
+ sub.s32 %r212, %r211, %r210;
+ mov.u32 %r213, 1;
+ shl.b32 %r28, %r213, %r212;
+ setp.lt.u32 %p42, %r8, %r28;
+ add.s32 %r214, %r28, %r8;
+ setp.lt.u32 %p43, %r214, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r216, %r3, %r213;
- add.s32 %r217, %r28, %r216;
- mul.wide.s32 %rd91, %r217, 8;
+ shl.b32 %r215, %r3, %r212;
+ add.s32 %r216, %r27, %r215;
+ mul.wide.s32 %rd91, %r216, 8;
add.s64 %rd24, %rd43, %rd91;
- shr.u32 %r218, %r29, 31;
- add.s32 %r219, %r29, %r218;
- shr.s32 %r322, %r219, 1;
+ shr.u32 %r217, %r28, 31;
+ add.s32 %r218, %r28, %r217;
+ shr.s32 %r320, %r218, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
@@ -508,38 +507,38 @@
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
- setp.lt.s32 %p45, %r29, 4;
+ setp.lt.s32 %p45, %r28, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
- mov.u32 %r319, %r322;
+ mov.u32 %r317, %r320;
$L__BB0_53:
- setp.ge.u32 %p46, %r8, %r319;
+ setp.ge.u32 %p46, %r8, %r317;
@%p46 bra $L__BB0_55;
- mad.lo.s32 %r220, %r319, %r3, %r28;
- mul.wide.s32 %rd92, %r220, 8;
+ mad.lo.s32 %r219, %r317, %r3, %r27;
+ mul.wide.s32 %rd92, %r219, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
- shr.u32 %r32, %r319, 1;
- setp.gt.u32 %p47, %r319, 3;
- mov.u32 %r319, %r32;
+ shr.u32 %r31, %r317, 1;
+ setp.gt.u32 %p47, %r317, 3;
+ mov.u32 %r317, %r31;
@%p47 bra $L__BB0_53;
$L__BB0_56:
- add.s32 %r221, %r28, %r3;
- mul.wide.u32 %rd95, %r221, 8;
+ add.s32 %r220, %r27, %r3;
+ mul.wide.u32 %rd95, %r220, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
@@ -564,29 +563,29 @@
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
- mov.u32 %r320, %r322;
+ mov.u32 %r318, %r320;
$L__BB0_63:
- setp.ge.u32 %p52, %r8, %r320;
+ setp.ge.u32 %p52, %r8, %r318;
@%p52 bra $L__BB0_65;
- mad.lo.s32 %r222, %r320, %r3, %r28;
- mul.wide.s32 %rd97, %r222, 8;
+ mad.lo.s32 %r221, %r318, %r3, %r27;
+ mul.wide.s32 %rd97, %r221, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
- shr.u32 %r34, %r320, 1;
- setp.gt.u32 %p53, %r320, 3;
- mov.u32 %r320, %r34;
+ shr.u32 %r33, %r318, 1;
+ setp.gt.u32 %p53, %r318, 3;
+ mov.u32 %r318, %r33;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
@@ -612,29 +611,29 @@
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
- mov.u32 %r321, %r322;
+ mov.u32 %r319, %r320;
$L__BB0_73:
- setp.ge.u32 %p58, %r8, %r321;
+ setp.ge.u32 %p58, %r8, %r319;
@%p58 bra $L__BB0_75;
- mad.lo.s32 %r223, %r321, %r3, %r28;
- mul.wide.s32 %rd100, %r223, 8;
+ mad.lo.s32 %r222, %r319, %r3, %r27;
+ mul.wide.s32 %rd100, %r222, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
- shr.u32 %r36, %r321, 1;
- setp.gt.u32 %p59, %r321, 3;
- mov.u32 %r321, %r36;
+ shr.u32 %r35, %r319, 1;
+ setp.gt.u32 %p59, %r319, 3;
+ mov.u32 %r319, %r35;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
@@ -661,26 +660,26 @@
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
- setp.ge.u32 %p64, %r8, %r322;
+ setp.ge.u32 %p64, %r8, %r320;
@%p64 bra $L__BB0_84;
- mad.lo.s32 %r224, %r322, %r3, %r28;
- mul.wide.s32 %rd103, %r224, 8;
+ mad.lo.s32 %r223, %r320, %r3, %r27;
+ mul.wide.s32 %rd103, %r223, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
- shr.u32 %r38, %r322, 1;
- setp.gt.u32 %p65, %r322, 3;
- mov.u32 %r322, %r38;
+ shr.u32 %r37, %r320, 1;
+ setp.gt.u32 %p65, %r320, 3;
+ mov.u32 %r320, %r37;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
@@ -699,151 +698,150 @@
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
- shl.b32 %r313, %r6, 1;
- mov.u32 %r233, %ctaid.y;
- mad.lo.s32 %r234, %r110, %r233, %r313;
- mul.wide.s32 %rd108, %r234, 8;
+ mov.u32 %r232, %ctaid.y;
+ mad.lo.s32 %r233, %r109, %r232, %r7;
+ mul.wide.s32 %rd108, %r233, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
- mov.b64 {%r225, %r226}, %rd109;
+ mov.b64 {%r224, %r225}, %rd109;
mov.b64 %rd110, %fd283;
- mov.b64 {%r227, %r228}, %rd110;
-
- st.volatile.global.v4.s32 [%rd106], {%r225,%r226,%r227,%r228};
+ mov.b64 {%r226, %r227}, %rd110;
+
+ st.volatile.global.v4.s32 [%rd106], {%r224,%r225,%r226,%r227};
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
- mov.b64 {%r229, %r230}, %rd111;
+ mov.b64 {%r228, %r229}, %rd111;
mov.b64 %rd112, %fd285;
- mov.b64 {%r231, %r232}, %rd112;
-
- st.volatile.global.v4.s32 [%rd107], {%r229,%r230,%r231,%r232};
+ mov.b64 {%r230, %r231}, %rd112;
+
+ st.volatile.global.v4.s32 [%rd107], {%r228,%r229,%r230,%r231};
$L__BB0_90:
- mov.u32 %r39, %ctaid.y;
+ mov.u32 %r38, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r235, %r6, %r8;
- or.b32 %r237, %r235, %r209;
- setp.ne.s32 %p68, %r237, 0;
+ or.b32 %r234, %r6, %r8;
+ or.b32 %r236, %r234, %r208;
+ setp.ne.s32 %p68, %r236, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
- mov.u32 %r238, %ctaid.x;
- mov.u32 %r239, %ctaid.z;
- mov.u32 %r240, %nctaid.x;
- mad.lo.s32 %r241, %r239, %r240, %r238;
- mul.wide.s32 %rd114, %r241, 8;
+ mov.u32 %r237, %ctaid.x;
+ mov.u32 %r238, %ctaid.z;
+ mov.u32 %r239, %nctaid.x;
+ mad.lo.s32 %r240, %r238, %r239, %r237;
+ mul.wide.s32 %rd114, %r240, 8;
add.s64 %rd28, %rd113, %rd114;
- add.s32 %r242, %r9, -1;
- setp.eq.s32 %p69, %r39, %r242;
+ add.s32 %r241, %r9, -1;
+ setp.eq.s32 %p69, %r38, %r241;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
- mov.u32 %r323, 8;
+ mov.u32 %r321, 8;
$L__BB0_93:
- nanosleep.u32 %r323;
-
- setp.lt.u32 %p71, %r323, 256;
- selp.u32 %r245, 1, 0, %p71;
- shl.b32 %r323, %r323, %r245;
+ nanosleep.u32 %r321;
+
+ setp.lt.u32 %p71, %r321, 256;
+ selp.u32 %r244, 1, 0, %p71;
+ shl.b32 %r321, %r321, %r244;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
- add.s32 %r246, %r9, %r3;
- add.s32 %r247, %r246, -1;
- div.s32 %r42, %r247, %r3;
- setp.lt.s32 %p73, %r42, 1;
+ add.s32 %r245, %r9, %r3;
+ add.s32 %r246, %r245, -1;
+ div.s32 %r41, %r246, %r3;
+ setp.lt.s32 %p73, %r41, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
- add.s32 %r249, %r4, %r2;
- add.s32 %r250, %r249, -1;
- shl.b32 %r251, %r8, 1;
- shl.b32 %r252, %r4, 1;
- mad.lo.s32 %r253, %r252, %r39, %r251;
- or.b32 %r254, %r253, 1;
- setp.ge.s32 %p74, %r254, %r110;
- div.s32 %r255, %r250, %r4;
- setp.ge.s32 %p75, %r39, %r255;
+ add.s32 %r248, %r4, %r2;
+ add.s32 %r249, %r248, -1;
+ shl.b32 %r250, %r8, 1;
+ shl.b32 %r251, %r4, 1;
+ mad.lo.s32 %r252, %r251, %r38, %r250;
+ or.b32 %r253, %r252, 1;
+ setp.ge.s32 %p74, %r253, %r109;
+ div.s32 %r254, %r249, %r4;
+ setp.ge.s32 %p75, %r38, %r254;
or.pred %p6, %p75, %p74;
- mul.lo.s32 %r256, %r4, %r39;
- shl.b32 %r257, %r256, 1;
- mad.lo.s32 %r258, %r110, %r6, %r257;
- add.s32 %r325, %r258, %r251;
- mul.lo.s32 %r44, %r110, %r3;
- mov.u32 %r326, 0;
+ mul.lo.s32 %r255, %r4, %r38;
+ shl.b32 %r256, %r255, 1;
+ mad.lo.s32 %r257, %r109, %r6, %r256;
+ add.s32 %r323, %r257, %r250;
+ mul.lo.s32 %r43, %r109, %r3;
+ mov.u32 %r324, 0;
mov.f64 %fd209, 0d0000000000000000;
- mov.u32 %r324, %r6;
+ mov.u32 %r322, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
- setp.ge.s32 %p76, %r324, %r9;
+ setp.ge.s32 %p76, %r322, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
- mul.wide.s32 %rd124, %r325, 8;
+ mul.wide.s32 %rd124, %r323, 8;
add.s64 %rd123, %rd41, %rd124;
- ld.volatile.global.v4.s32 {%r259,%r260,%r261,%r262}, [%rd123];
-
- mov.b64 %rd125, {%r259, %r260};
+ ld.volatile.global.v4.s32 {%r258,%r259,%r260,%r261}, [%rd123];
+
+ mov.b64 %rd125, {%r258, %r259};
mov.b64 %fd289, %rd125;
- mov.b64 %rd126, {%r261, %r262};
+ mov.b64 %rd126, {%r260, %r261};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
- add.s32 %r325, %r325, %r44;
- add.s32 %r324, %r324, %r3;
- add.s32 %r326, %r326, 1;
- setp.lt.s32 %p77, %r326, %r42;
+ add.s32 %r323, %r323, %r43;
+ add.s32 %r322, %r322, %r3;
+ add.s32 %r324, %r324, 1;
+ setp.lt.s32 %p77, %r324, %r41;
@%p77 bra $L__BB0_96;
$L__BB0_100:
- clz.b32 %r263, %r3;
- mov.u32 %r264, 31;
- sub.s32 %r265, %r264, %r263;
- mov.u32 %r266, 1;
- shl.b32 %r51, %r266, %r265;
- setp.lt.u32 %p78, %r6, %r51;
- add.s32 %r267, %r51, %r6;
- setp.lt.u32 %p79, %r267, %r3;
+ clz.b32 %r262, %r3;
+ mov.u32 %r263, 31;
+ sub.s32 %r264, %r263, %r262;
+ mov.u32 %r265, 1;
+ shl.b32 %r50, %r265, %r264;
+ setp.lt.u32 %p78, %r6, %r50;
+ add.s32 %r266, %r50, %r6;
+ setp.lt.u32 %p79, %r266, %r3;
and.pred %p7, %p78, %p79;
- add.s32 %r268, %r28, %r51;
- mul.wide.s32 %rd127, %r268, 8;
+ add.s32 %r267, %r27, %r50;
+ mul.wide.s32 %rd127, %r267, 8;
add.s64 %rd30, %rd43, %rd127;
- shr.u32 %r269, %r51, 31;
- add.s32 %r270, %r51, %r269;
- shr.s32 %r333, %r270, 1;
+ shr.u32 %r268, %r50, 31;
+ add.s32 %r269, %r50, %r268;
+ shr.s32 %r331, %r269, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
@@ -851,38 +849,38 @@
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
- setp.lt.s32 %p81, %r51, 4;
+ setp.lt.s32 %p81, %r50, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
- mov.u32 %r327, %r333;
+ mov.u32 %r325, %r331;
$L__BB0_104:
- setp.ge.u32 %p82, %r6, %r327;
+ setp.ge.u32 %p82, %r6, %r325;
@%p82 bra $L__BB0_106;
- add.s32 %r271, %r327, %r28;
- mul.wide.s32 %rd129, %r271, 8;
+ add.s32 %r270, %r325, %r27;
+ mul.wide.s32 %rd129, %r270, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
- shr.u32 %r54, %r327, 1;
- setp.gt.u32 %p83, %r327, 3;
- mov.u32 %r327, %r54;
+ shr.u32 %r53, %r325, 1;
+ setp.gt.u32 %p83, %r325, 3;
+ mov.u32 %r325, %r53;
@%p83 bra $L__BB0_104;
$L__BB0_107:
- add.s32 %r272, %r28, 1;
- mul.wide.u32 %rd132, %r272, 8;
+ add.s32 %r271, %r27, 1;
+ mul.wide.u32 %rd132, %r271, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
@@ -907,29 +905,29 @@
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
- mov.u32 %r328, %r333;
+ mov.u32 %r326, %r331;
$L__BB0_114:
- setp.ge.u32 %p88, %r6, %r328;
+ setp.ge.u32 %p88, %r6, %r326;
@%p88 bra $L__BB0_116;
- add.s32 %r273, %r328, %r28;
- mul.wide.s32 %rd134, %r273, 8;
+ add.s32 %r272, %r326, %r27;
+ mul.wide.s32 %rd134, %r272, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
- shr.u32 %r56, %r328, 1;
- setp.gt.u32 %p89, %r328, 3;
- mov.u32 %r328, %r56;
+ shr.u32 %r55, %r326, 1;
+ setp.gt.u32 %p89, %r326, 3;
+ mov.u32 %r326, %r55;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
@@ -944,90 +942,90 @@
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
- add.s32 %r274, %r4, %r2;
- add.s32 %r275, %r274, -1;
- div.s32 %r276, %r275, %r4;
- setp.ge.s32 %p93, %r39, %r276;
+ add.s32 %r273, %r4, %r2;
+ add.s32 %r274, %r273, -1;
+ div.s32 %r275, %r274, %r4;
+ setp.ge.s32 %p93, %r38, %r275;
@%p93 bra $L__BB0_124;
- shl.b32 %r57, %r8, 1;
- mul.lo.s32 %r277, %r4, %r39;
- shl.b32 %r58, %r277, 1;
- add.s32 %r278, %r57, %r58;
- or.b32 %r279, %r278, 1;
- setp.ge.s32 %p94, %r279, %r110;
+ shl.b32 %r56, %r8, 1;
+ mul.lo.s32 %r276, %r4, %r38;
+ shl.b32 %r57, %r276, 1;
+ add.s32 %r277, %r56, %r57;
+ or.b32 %r278, %r277, 1;
+ setp.ge.s32 %p94, %r278, %r109;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r284, %r58, %r57;
- mul.wide.s32 %rd138, %r284, 8;
+ add.s32 %r283, %r57, %r56;
+ mul.wide.s32 %rd138, %r283, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
- mov.b64 {%r280, %r281}, %rd139;
+ mov.b64 {%r279, %r280}, %rd139;
mov.b64 %rd140, %fd293;
- mov.b64 {%r282, %r283}, %rd140;
-
- st.global.cs.v4.s32 [%rd137], {%r280,%r281,%r282,%r283};
+ mov.b64 {%r281, %r282}, %rd140;
+
+ st.global.cs.v4.s32 [%rd137], {%r279,%r280,%r281,%r282};
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
- add.s32 %r286, %r4, %r2;
- add.s32 %r287, %r286, -1;
- shl.b32 %r288, %r8, 1;
- shl.b32 %r289, %r4, 1;
- mad.lo.s32 %r290, %r289, %r39, %r288;
- or.b32 %r291, %r290, 1;
- setp.ge.s32 %p96, %r291, %r110;
- div.s32 %r292, %r287, %r4;
- setp.ge.s32 %p97, %r39, %r292;
+ add.s32 %r285, %r4, %r2;
+ add.s32 %r286, %r285, -1;
+ shl.b32 %r287, %r8, 1;
+ shl.b32 %r288, %r4, 1;
+ mad.lo.s32 %r289, %r288, %r38, %r287;
+ or.b32 %r290, %r289, 1;
+ setp.ge.s32 %p96, %r290, %r109;
+ div.s32 %r291, %r286, %r4;
+ setp.ge.s32 %p97, %r38, %r291;
or.pred %p8, %p97, %p96;
- mul.lo.s32 %r293, %r4, %r39;
- shl.b32 %r294, %r293, 1;
- mad.lo.s32 %r295, %r110, %r6, %r294;
- add.s32 %r330, %r295, %r288;
- mul.lo.s32 %r60, %r110, %r3;
- mov.u32 %r331, 0;
+ mul.lo.s32 %r292, %r4, %r38;
+ shl.b32 %r293, %r292, 1;
+ mad.lo.s32 %r294, %r109, %r6, %r293;
+ add.s32 %r328, %r294, %r287;
+ mul.lo.s32 %r59, %r109, %r3;
+ mov.u32 %r329, 0;
mov.f64 %fd235, 0d0000000000000000;
- mov.u32 %r329, %r6;
+ mov.u32 %r327, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
- setp.ge.s32 %p98, %r329, %r9;
+ setp.ge.s32 %p98, %r327, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
- mul.wide.s32 %rd142, %r330, 8;
+ mul.wide.s32 %rd142, %r328, 8;
add.s64 %rd141, %rd40, %rd142;
- ld.volatile.global.v4.s32 {%r296,%r297,%r298,%r299}, [%rd141];
-
- mov.b64 %rd143, {%r296, %r297};
+ ld.volatile.global.v4.s32 {%r295,%r296,%r297,%r298}, [%rd141];
+
+ mov.b64 %rd143, {%r295, %r296};
mov.b64 %fd297, %rd143;
- mov.b64 %rd144, {%r298, %r299};
+ mov.b64 %rd144, {%r297, %r298};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
- add.s32 %r330, %r330, %r60;
- add.s32 %r329, %r329, %r3;
- add.s32 %r331, %r331, 1;
- setp.lt.s32 %p99, %r331, %r42;
+ add.s32 %r328, %r328, %r59;
+ add.s32 %r327, %r327, %r3;
+ add.s32 %r329, %r329, 1;
+ setp.lt.s32 %p99, %r329, %r41;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@@ -1040,29 +1038,29 @@
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
- mov.u32 %r332, %r333;
+ mov.u32 %r330, %r331;
$L__BB0_134:
- setp.ge.u32 %p102, %r6, %r332;
+ setp.ge.u32 %p102, %r6, %r330;
@%p102 bra $L__BB0_136;
- add.s32 %r300, %r332, %r28;
- mul.wide.s32 %rd145, %r300, 8;
+ add.s32 %r299, %r330, %r27;
+ mul.wide.s32 %rd145, %r299, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
- shr.u32 %r68, %r332, 1;
- setp.gt.u32 %p103, %r332, 3;
- mov.u32 %r332, %r68;
+ shr.u32 %r67, %r330, 1;
+ setp.gt.u32 %p103, %r330, 3;
+ mov.u32 %r330, %r67;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
@@ -1089,26 +1087,26 @@
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
- setp.ge.u32 %p108, %r6, %r333;
+ setp.ge.u32 %p108, %r6, %r331;
@%p108 bra $L__BB0_145;
- add.s32 %r301, %r333, %r28;
- mul.wide.s32 %rd148, %r301, 8;
+ add.s32 %r300, %r331, %r27;
+ mul.wide.s32 %rd148, %r300, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
- shr.u32 %r70, %r333, 1;
- setp.gt.u32 %p109, %r333, 3;
- mov.u32 %r333, %r70;
+ shr.u32 %r69, %r331, 1;
+ setp.gt.u32 %p109, %r331, 3;
+ mov.u32 %r331, %r69;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
@@ -1123,34 +1121,34 @@
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
- add.s32 %r302, %r4, %r2;
- add.s32 %r303, %r302, -1;
- div.s32 %r304, %r303, %r4;
- setp.ge.s32 %p113, %r39, %r304;
+ add.s32 %r301, %r4, %r2;
+ add.s32 %r302, %r301, -1;
+ div.s32 %r303, %r302, %r4;
+ setp.ge.s32 %p113, %r38, %r303;
@%p113 bra $L__BB0_153;
- shl.b32 %r71, %r8, 1;
- mul.lo.s32 %r305, %r4, %r39;
- shl.b32 %r72, %r305, 1;
- add.s32 %r306, %r71, %r72;
- or.b32 %r307, %r306, 1;
- setp.ge.s32 %p114, %r307, %r110;
+ shl.b32 %r70, %r8, 1;
+ mul.lo.s32 %r304, %r4, %r38;
+ shl.b32 %r71, %r304, 1;
+ add.s32 %r305, %r70, %r71;
+ or.b32 %r306, %r305, 1;
+ setp.ge.s32 %p114, %r306, %r109;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r312, %r72, %r71;
- mul.wide.s32 %rd152, %r312, 8;
+ add.s32 %r311, %r71, %r70;
+ mul.wide.s32 %rd152, %r311, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
- mov.b64 {%r308, %r309}, %rd153;
+ mov.b64 {%r307, %r308}, %rd153;
mov.b64 %rd154, %fd301;
- mov.b64 {%r310, %r311}, %rd154;
-
- st.global.cs.v4.s32 [%rd151], {%r308,%r309,%r310,%r311};
+ mov.b64 {%r309, %r310}, %rd154;
+
+ st.global.cs.v4.s32 [%rd151], {%r307,%r308,%r309,%r310};
$L__BB0_153:
ret;
2: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_96
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 54
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103395arrayE[];
.entry _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
.reg .b32 %r<334>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
ld.param.v2.u32 {%r109, %r110}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r119, %r120}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r123, %r124}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r145, %r110, 1;
shr.u32 %r146, %r145, 31;
add.s32 %r147, %r145, %r146;
shr.s32 %r2, %r147, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r148, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r149, %r4, 3;
mad.lo.s32 %r150, %r149, %r148, 15;
and.b32 %r151, %r150, -16;
cvt.u64.u32 %rd1, %r151;
mul.lo.s32 %r152, %r4, %r2;
shl.b32 %r153, %r152, 4;
or.b32 %r154, %r153, 15;
and.b32 %r5, %r154, -16;
add.s32 %r155, %r154, %r5;
and.b32 %r156, %r155, -16;
cvt.s64.s32 %rd2, %r156;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
cvt.rn.f64.s32 %fd1, %r110;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r157, %r7, 1;
setp.lt.s32 %p10, %r157, %r110;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r158, smem_ptr; }
// end inline asm
shl.b32 %r161, %r6, 4;
add.s32 %r159, %r158, %r161;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r160, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r160, 0;
cp.async.ca.shared.global [%r159], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r162, %r4, 215;
div.s32 %r163, %r162, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r164, %r9, %r163;
add.s32 %r165, %r164, -1;
div.s32 %r10, %r165, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r167, %ctaid.y;
mul.lo.s32 %r168, %r10, %r4;
mul.lo.s32 %r11, %r168, %r167;
shl.b32 %r169, %r8, 3;
shl.b32 %r170, %r6, 4;
mad.lo.s32 %r12, %r169, %r110, %r170;
mul.lo.s32 %r171, %r110, %r8;
cvt.s64.s32 %rd52, %r171;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r172, %r11, %r110;
cvt.s64.s32 %rd6, %r172;
mul.lo.s32 %r13, %r110, %r4;
mul.lo.s32 %r14, %r10, %r167;
add.s32 %r15, %r171, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r15, 8;
add.s64 %rd7, %rd54, %rd55;
mov.u32 %r173, %tid.z;
mad.lo.s32 %r174, %r4, %r173, %r8;
mad.lo.s32 %r16, %r174, %r3, %r6;
mul.wide.u32 %rd56, %r16, 8;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r175, %r3;
mov.u32 %r176, 31;
sub.s32 %r177, %r176, %r175;
mov.u32 %r178, 1;
shl.b32 %r17, %r178, %r177;
setp.lt.u32 %p14, %r6, %r17;
add.s32 %r179, %r17, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r180, %r16, %r17;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r181, %r17, 31;
add.s32 %r182, %r17, %r181;
shr.s32 %r18, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r183, %r16, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r174, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r316, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
// end inline asm
add.s32 %r187, %r12, %r186;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
// end inline asm
add.s32 %r197, %r12, %r196;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r184, %r316, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r189, %r13, %r316;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r188, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r188, 0;
cp.async.ca.shared.global [%r187], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r190, %r14, %r316;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd268, 0d0000000000000000;
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r192, %r14, %r316;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
add.s32 %r194, %r14, %r316;
mad.lo.s32 %r22, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
setp.gt.s32 %p21, %r22, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
mul.lo.s32 %r195, %r22, %r119;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
setp.lt.s32 %p22, %r22, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
mul.lo.s32 %r199, %r13, %r316;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r198, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r198, 0;
cp.async.ca.shared.global [%r197], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
add.s32 %r315, %r14, %r316;
mad.lo.s32 %r314, %r315, %r4, %r8;
setp.gt.s32 %p117, %r314, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
mul.lo.s32 %r200, %r22, %r123;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd275, %fd274;
@%p23 bra $L__BB0_22;
ld.shared.v2.f64 {%fd103, %fd104}, [%rd7];
ld.shared.v2.f64 {%fd107, %fd108}, [%rd10];
ld.shared.v2.f64 {%fd111, %fd112}, [%rd12];
mul.f64 %fd115, %fd111, %fd103;
add.f64 %fd116, %fd115, 0d0000000000000000;
sub.f64 %fd117, %fd107, %fd270;
mul.f64 %fd118, %fd271, %fd117;
fma.rn.f64 %fd119, %fd115, %fd118, 0d0000000000000000;
fma.rn.f64 %fd272, %fd118, %fd103, %fd272;
mul.f64 %fd120, %fd112, %fd104;
add.f64 %fd275, %fd116, %fd120;
sub.f64 %fd121, %fd108, %fd270;
mul.f64 %fd122, %fd271, %fd121;
fma.rn.f64 %fd274, %fd120, %fd122, %fd119;
fma.rn.f64 %fd273, %fd122, %fd104, %fd273;
$L__BB0_22:
st.shared.f64 [%rd8], %fd275;
bar.sync 0;
@%p26 bra $L__BB0_24;
ld.shared.f64 %fd123, [%rd9];
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
setp.lt.s32 %p27, %r17, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
mov.u32 %r317, %r18;
$L__BB0_26:
setp.ge.u32 %p28, %r6, %r317;
@%p28 bra $L__BB0_28;
add.s32 %r201, %r317, %r16;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
shr.u32 %r24, %r317, 1;
setp.gt.u32 %p29, %r317, 3;
mov.u32 %r317, %r24;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@%p30 bra $L__BB0_32;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f64 %fd130, [%rd8];
add.f64 %fd276, %fd130, 0d0000000000000000;
@%p31 bra $L__BB0_32;
ld.shared.f64 %fd131, [%rd11];
add.f64 %fd276, %fd276, %fd131;
$L__BB0_32:
bar.sync 0;
st.shared.f64 [%rd8], %fd274;
bar.sync 0;
@%p26 bra $L__BB0_34;
ld.shared.f64 %fd132, [%rd9];
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
setp.lt.s32 %p118, %r17, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
mov.u32 %r318, %r18;
$L__BB0_36:
setp.ge.u32 %p34, %r6, %r318;
@%p34 bra $L__BB0_38;
add.s32 %r202, %r318, %r16;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
shr.u32 %r26, %r318, 1;
setp.gt.u32 %p35, %r318, 3;
mov.u32 %r318, %r26;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f64 %fd139, [%rd8];
add.f64 %fd277, %fd139, 0d0000000000000000;
@%p37 bra $L__BB0_42;
ld.shared.f64 %fd140, [%rd11];
add.f64 %fd277, %fd277, %fd140;
$L__BB0_42:
bar.sync 0;
@%p30 bra $L__BB0_44;
st.shared.f64 [%rd13], %fd276;
$L__BB0_44:
bar.sync 0;
ld.shared.f64 %fd33, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_46;
st.shared.f64 [%rd13], %fd277;
$L__BB0_46:
bar.sync 0;
ld.shared.f64 %fd34, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_48;
mul.f64 %fd141, %fd2, %fd271;
ld.shared.v2.f64 {%fd142, %fd143}, [%rd10];
ld.shared.v2.f64 {%fd146, %fd147}, [%rd12];
ld.shared.v2.f64 {%fd150, %fd151}, [%rd7];
mul.f64 %fd154, %fd146, %fd150;
mul.f64 %fd155, %fd154, %fd1;
sub.f64 %fd156, %fd142, %fd270;
mul.f64 %fd157, %fd271, %fd156;
sub.f64 %fd158, %fd155, %fd33;
mul.f64 %fd159, %fd34, %fd157;
sub.f64 %fd160, %fd158, %fd159;
mul.f64 %fd161, %fd141, %fd160;
mov.b64 %rd86, %fd161;
mul.f64 %fd162, %fd147, %fd151;
mul.f64 %fd163, %fd162, %fd1;
sub.f64 %fd164, %fd143, %fd270;
mul.f64 %fd165, %fd271, %fd164;
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
mad.lo.s32 %r207, %r316, %r4, %r11;
mad.lo.s32 %r208, %r207, %r110, %r15;
mul.wide.s32 %rd88, %r208, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
// end inline asm
$L__BB0_48:
add.s32 %r316, %r316, 1;
setp.lt.s32 %p41, %r316, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
mov.u32 %r209, %tid.z;
mad.lo.s32 %r210, %r4, %r209, %r8;
mad.lo.s32 %r28, %r210, %r3, %r6;
mul.wide.u32 %rd89, %r28, 8;
add.s64 %rd23, %rd43, %rd89;
clz.b32 %r211, %r4;
mov.u32 %r212, 31;
sub.s32 %r213, %r212, %r211;
mov.u32 %r214, 1;
shl.b32 %r29, %r214, %r213;
setp.lt.u32 %p42, %r8, %r29;
add.s32 %r215, %r29, %r8;
setp.lt.u32 %p43, %r215, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r216, %r3, %r213;
add.s32 %r217, %r28, %r216;
mul.wide.s32 %rd91, %r217, 8;
add.s64 %rd24, %rd43, %rd91;
shr.u32 %r218, %r29, 31;
add.s32 %r219, %r29, %r218;
shr.s32 %r322, %r219, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
ld.shared.f64 %fd170, [%rd24];
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
setp.lt.s32 %p45, %r29, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
mov.u32 %r319, %r322;
$L__BB0_53:
setp.ge.u32 %p46, %r8, %r319;
@%p46 bra $L__BB0_55;
mad.lo.s32 %r220, %r319, %r3, %r28;
mul.wide.s32 %rd92, %r220, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
shr.u32 %r32, %r319, 1;
setp.gt.u32 %p47, %r319, 3;
mov.u32 %r319, %r32;
@%p47 bra $L__BB0_53;
$L__BB0_56:
add.s32 %r221, %r28, %r3;
mul.wide.u32 %rd95, %r221, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f64 %fd177, [%rd23];
add.f64 %fd282, %fd177, 0d0000000000000000;
@%p49 bra $L__BB0_59;
ld.shared.f64 %fd178, [%rd25];
add.f64 %fd282, %fd282, %fd178;
$L__BB0_59:
bar.sync 0;
st.shared.f64 [%rd23], %fd273;
bar.sync 0;
@%p44 bra $L__BB0_61;
ld.shared.f64 %fd179, [%rd24];
ld.shared.f64 %fd180, [%rd23];
add.f64 %fd181, %fd179, %fd180;
st.shared.f64 [%rd23], %fd181;
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
mov.u32 %r320, %r322;
$L__BB0_63:
setp.ge.u32 %p52, %r8, %r320;
@%p52 bra $L__BB0_65;
mad.lo.s32 %r222, %r320, %r3, %r28;
mul.wide.s32 %rd97, %r222, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
shr.u32 %r34, %r320, 1;
setp.gt.u32 %p53, %r320, 3;
mov.u32 %r320, %r34;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f64 %fd186, [%rd23];
add.f64 %fd283, %fd186, 0d0000000000000000;
@%p55 bra $L__BB0_69;
ld.shared.f64 %fd187, [%rd25];
add.f64 %fd283, %fd283, %fd187;
$L__BB0_69:
bar.sync 0;
st.shared.f64 [%rd23], %fd280;
bar.sync 0;
@%p44 bra $L__BB0_71;
ld.shared.f64 %fd188, [%rd24];
ld.shared.f64 %fd189, [%rd23];
add.f64 %fd190, %fd188, %fd189;
st.shared.f64 [%rd23], %fd190;
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
mov.u32 %r321, %r322;
$L__BB0_73:
setp.ge.u32 %p58, %r8, %r321;
@%p58 bra $L__BB0_75;
mad.lo.s32 %r223, %r321, %r3, %r28;
mul.wide.s32 %rd100, %r223, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
shr.u32 %r36, %r321, 1;
setp.gt.u32 %p59, %r321, 3;
mov.u32 %r321, %r36;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f64 %fd195, [%rd23];
add.f64 %fd284, %fd195, 0d0000000000000000;
@%p61 bra $L__BB0_79;
ld.shared.f64 %fd196, [%rd25];
add.f64 %fd284, %fd284, %fd196;
$L__BB0_79:
bar.sync 0;
st.shared.f64 [%rd23], %fd281;
bar.sync 0;
@%p44 bra $L__BB0_81;
ld.shared.f64 %fd197, [%rd24];
ld.shared.f64 %fd198, [%rd23];
add.f64 %fd199, %fd197, %fd198;
st.shared.f64 [%rd23], %fd199;
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
setp.ge.u32 %p64, %r8, %r322;
@%p64 bra $L__BB0_84;
mad.lo.s32 %r224, %r322, %r3, %r28;
mul.wide.s32 %rd103, %r224, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
shr.u32 %r38, %r322, 1;
setp.gt.u32 %p65, %r322, 3;
mov.u32 %r322, %r38;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f64 %fd204, [%rd23];
add.f64 %fd285, %fd204, 0d0000000000000000;
@%p67 bra $L__BB0_88;
ld.shared.f64 %fd205, [%rd25];
add.f64 %fd285, %fd285, %fd205;
$L__BB0_88:
setp.eq.s32 %p116, %r8, 0;
and.pred %p115, %p116, %p1;
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
shl.b32 %r313, %r6, 1;
mov.u32 %r233, %ctaid.y;
mad.lo.s32 %r234, %r110, %r233, %r313;
mul.wide.s32 %rd108, %r234, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
mov.b64 {%r225, %r226}, %rd109;
mov.b64 %rd110, %fd283;
mov.b64 {%r227, %r228}, %rd110;
// begin inline asm
st.volatile.global.v4.s32 [%rd106], {%r225,%r226,%r227,%r228};
// end inline asm
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
mov.b64 {%r229, %r230}, %rd111;
mov.b64 %rd112, %fd285;
mov.b64 {%r231, %r232}, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd107], {%r229,%r230,%r231,%r232};
// end inline asm
$L__BB0_90:
mov.u32 %r39, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r235, %r6, %r8;
or.b32 %r237, %r235, %r209;
setp.ne.s32 %p68, %r237, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
mov.u32 %r238, %ctaid.x;
mov.u32 %r239, %ctaid.z;
mov.u32 %r240, %nctaid.x;
mad.lo.s32 %r241, %r239, %r240, %r238;
mul.wide.s32 %rd114, %r241, 8;
add.s64 %rd28, %rd113, %rd114;
add.s32 %r242, %r9, -1;
setp.eq.s32 %p69, %r39, %r242;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
mov.u32 %r323, 8;
$L__BB0_93:
// begin inline asm
nanosleep.u32 %r323;
// end inline asm
setp.lt.u32 %p71, %r323, 256;
selp.u32 %r245, 1, 0, %p71;
shl.b32 %r323, %r323, %r245;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
add.s32 %r246, %r9, %r3;
add.s32 %r247, %r246, -1;
div.s32 %r42, %r247, %r3;
setp.lt.s32 %p73, %r42, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
add.s32 %r249, %r4, %r2;
add.s32 %r250, %r249, -1;
shl.b32 %r251, %r8, 1;
shl.b32 %r252, %r4, 1;
mad.lo.s32 %r253, %r252, %r39, %r251;
or.b32 %r254, %r253, 1;
setp.ge.s32 %p74, %r254, %r110;
div.s32 %r255, %r250, %r4;
setp.ge.s32 %p75, %r39, %r255;
or.pred %p6, %p75, %p74;
mul.lo.s32 %r256, %r4, %r39;
shl.b32 %r257, %r256, 1;
mad.lo.s32 %r258, %r110, %r6, %r257;
add.s32 %r325, %r258, %r251;
mul.lo.s32 %r44, %r110, %r3;
mov.u32 %r326, 0;
mov.f64 %fd209, 0d0000000000000000;
mov.u32 %r324, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
setp.ge.s32 %p76, %r324, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
mul.wide.s32 %rd124, %r325, 8;
add.s64 %rd123, %rd41, %rd124;
// begin inline asm
ld.volatile.global.v4.s32 {%r259,%r260,%r261,%r262}, [%rd123];
// end inline asm
mov.b64 %rd125, {%r259, %r260};
mov.b64 %fd289, %rd125;
mov.b64 %rd126, {%r261, %r262};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
add.s32 %r325, %r325, %r44;
add.s32 %r324, %r324, %r3;
add.s32 %r326, %r326, 1;
setp.lt.s32 %p77, %r326, %r42;
@%p77 bra $L__BB0_96;
$L__BB0_100:
clz.b32 %r263, %r3;
mov.u32 %r264, 31;
sub.s32 %r265, %r264, %r263;
mov.u32 %r266, 1;
shl.b32 %r51, %r266, %r265;
setp.lt.u32 %p78, %r6, %r51;
add.s32 %r267, %r51, %r6;
setp.lt.u32 %p79, %r267, %r3;
and.pred %p7, %p78, %p79;
add.s32 %r268, %r28, %r51;
mul.wide.s32 %rd127, %r268, 8;
add.s64 %rd30, %rd43, %rd127;
shr.u32 %r269, %r51, 31;
add.s32 %r270, %r51, %r269;
shr.s32 %r333, %r270, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
ld.shared.f64 %fd214, [%rd30];
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
setp.lt.s32 %p81, %r51, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
mov.u32 %r327, %r333;
$L__BB0_104:
setp.ge.u32 %p82, %r6, %r327;
@%p82 bra $L__BB0_106;
add.s32 %r271, %r327, %r28;
mul.wide.s32 %rd129, %r271, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
shr.u32 %r54, %r327, 1;
setp.gt.u32 %p83, %r327, 3;
mov.u32 %r327, %r54;
@%p83 bra $L__BB0_104;
$L__BB0_107:
add.s32 %r272, %r28, 1;
mul.wide.u32 %rd132, %r272, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f64 %fd221, [%rd23];
add.f64 %fd292, %fd221, 0d0000000000000000;
@%p85 bra $L__BB0_110;
ld.shared.f64 %fd222, [%rd31];
add.f64 %fd292, %fd292, %fd222;
$L__BB0_110:
bar.sync 0;
st.shared.f64 [%rd23], %fd291;
bar.sync 0;
@%p80 bra $L__BB0_112;
ld.shared.f64 %fd223, [%rd30];
ld.shared.f64 %fd224, [%rd23];
add.f64 %fd225, %fd223, %fd224;
st.shared.f64 [%rd23], %fd225;
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
mov.u32 %r328, %r333;
$L__BB0_114:
setp.ge.u32 %p88, %r6, %r328;
@%p88 bra $L__BB0_116;
add.s32 %r273, %r328, %r28;
mul.wide.s32 %rd134, %r273, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
shr.u32 %r56, %r328, 1;
setp.gt.u32 %p89, %r328, 3;
mov.u32 %r328, %r56;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f64 %fd230, [%rd23];
add.f64 %fd293, %fd230, 0d0000000000000000;
@%p91 bra $L__BB0_120;
ld.shared.f64 %fd231, [%rd31];
add.f64 %fd293, %fd293, %fd231;
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
add.s32 %r274, %r4, %r2;
add.s32 %r275, %r274, -1;
div.s32 %r276, %r275, %r4;
setp.ge.s32 %p93, %r39, %r276;
@%p93 bra $L__BB0_124;
shl.b32 %r57, %r8, 1;
mul.lo.s32 %r277, %r4, %r39;
shl.b32 %r58, %r277, 1;
add.s32 %r278, %r57, %r58;
or.b32 %r279, %r278, 1;
setp.ge.s32 %p94, %r279, %r110;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r284, %r58, %r57;
mul.wide.s32 %rd138, %r284, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
mov.b64 {%r280, %r281}, %rd139;
mov.b64 %rd140, %fd293;
mov.b64 {%r282, %r283}, %rd140;
// begin inline asm
st.global.cs.v4.s32 [%rd137], {%r280,%r281,%r282,%r283};
// end inline asm
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
add.s32 %r286, %r4, %r2;
add.s32 %r287, %r286, -1;
shl.b32 %r288, %r8, 1;
shl.b32 %r289, %r4, 1;
mad.lo.s32 %r290, %r289, %r39, %r288;
or.b32 %r291, %r290, 1;
setp.ge.s32 %p96, %r291, %r110;
div.s32 %r292, %r287, %r4;
setp.ge.s32 %p97, %r39, %r292;
or.pred %p8, %p97, %p96;
mul.lo.s32 %r293, %r4, %r39;
shl.b32 %r294, %r293, 1;
mad.lo.s32 %r295, %r110, %r6, %r294;
add.s32 %r330, %r295, %r288;
mul.lo.s32 %r60, %r110, %r3;
mov.u32 %r331, 0;
mov.f64 %fd235, 0d0000000000000000;
mov.u32 %r329, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
setp.ge.s32 %p98, %r329, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
mul.wide.s32 %rd142, %r330, 8;
add.s64 %rd141, %rd40, %rd142;
// begin inline asm
ld.volatile.global.v4.s32 {%r296,%r297,%r298,%r299}, [%rd141];
// end inline asm
mov.b64 %rd143, {%r296, %r297};
mov.b64 %fd297, %rd143;
mov.b64 %rd144, {%r298, %r299};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
add.s32 %r330, %r330, %r60;
add.s32 %r329, %r329, %r3;
add.s32 %r331, %r331, 1;
setp.lt.s32 %p99, %r331, %r42;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@%p80 bra $L__BB0_132;
ld.shared.f64 %fd240, [%rd30];
ld.shared.f64 %fd241, [%rd23];
add.f64 %fd242, %fd240, %fd241;
st.shared.f64 [%rd23], %fd242;
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
mov.u32 %r332, %r333;
$L__BB0_134:
setp.ge.u32 %p102, %r6, %r332;
@%p102 bra $L__BB0_136;
add.s32 %r300, %r332, %r28;
mul.wide.s32 %rd145, %r300, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
shr.u32 %r68, %r332, 1;
setp.gt.u32 %p103, %r332, 3;
mov.u32 %r332, %r68;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
setp.lt.u32 %p105, %r3, 2;
ld.shared.f64 %fd247, [%rd23];
add.f64 %fd300, %fd247, 0d0000000000000000;
@%p105 bra $L__BB0_140;
ld.shared.f64 %fd248, [%rd31];
add.f64 %fd300, %fd300, %fd248;
$L__BB0_140:
bar.sync 0;
st.shared.f64 [%rd23], %fd299;
bar.sync 0;
@%p80 bra $L__BB0_142;
ld.shared.f64 %fd249, [%rd30];
ld.shared.f64 %fd250, [%rd23];
add.f64 %fd251, %fd249, %fd250;
st.shared.f64 [%rd23], %fd251;
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
setp.ge.u32 %p108, %r6, %r333;
@%p108 bra $L__BB0_145;
add.s32 %r301, %r333, %r28;
mul.wide.s32 %rd148, %r301, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
shr.u32 %r70, %r333, 1;
setp.gt.u32 %p109, %r333, 3;
mov.u32 %r333, %r70;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
setp.lt.u32 %p111, %r3, 2;
ld.shared.f64 %fd256, [%rd23];
add.f64 %fd301, %fd256, 0d0000000000000000;
@%p111 bra $L__BB0_149;
ld.shared.f64 %fd257, [%rd31];
add.f64 %fd301, %fd301, %fd257;
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
add.s32 %r302, %r4, %r2;
add.s32 %r303, %r302, -1;
div.s32 %r304, %r303, %r4;
setp.ge.s32 %p113, %r39, %r304;
@%p113 bra $L__BB0_153;
shl.b32 %r71, %r8, 1;
mul.lo.s32 %r305, %r4, %r39;
shl.b32 %r72, %r305, 1;
add.s32 %r306, %r71, %r72;
or.b32 %r307, %r306, 1;
setp.ge.s32 %p114, %r307, %r110;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_f3d3a0e4_103399nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r312, %r72, %r71;
mul.wide.s32 %rd152, %r312, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
mov.b64 {%r308, %r309}, %rd153;
mov.b64 %rd154, %fd301;
mov.b64 {%r310, %r311}, %rd154;
// begin inline asm
st.global.cs.v4.s32 [%rd151], {%r308,%r309,%r310,%r311};
// end inline asm
$L__BB0_153:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72335arrayE[];
.entry _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
.reg .b32 %r<332>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
ld.param.v2.u32 {%r108, %r109}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r118, %r119}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r122, %r123}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r144, %r109, 1;
shr.u32 %r145, %r144, 31;
add.s32 %r146, %r144, %r145;
shr.s32 %r2, %r146, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r147, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r148, %r4, 3;
mad.lo.s32 %r149, %r148, %r147, 15;
and.b32 %r150, %r149, -16;
cvt.u64.u32 %rd1, %r150;
mul.lo.s32 %r151, %r4, %r2;
shl.b32 %r152, %r151, 4;
or.b32 %r153, %r152, 15;
and.b32 %r5, %r153, -16;
add.s32 %r154, %r153, %r5;
and.b32 %r155, %r154, -16;
cvt.s64.s32 %rd2, %r155;
mov.u64 %rd43, _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
cvt.rn.f64.s32 %fd1, %r109;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r156, %r7, 1;
setp.lt.s32 %p10, %r156, %r109;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r157, smem_ptr; }
// end inline asm
shl.b32 %r160, %r6, 4;
add.s32 %r158, %r157, %r160;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r159, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r159, 0;
cp.async.ca.shared.global [%r158], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r161, %r4, 215;
div.s32 %r162, %r161, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r163, %r9, %r162;
add.s32 %r164, %r163, -1;
div.s32 %r10, %r164, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r166, %ctaid.y;
mul.lo.s32 %r167, %r10, %r4;
mul.lo.s32 %r11, %r167, %r166;
mad.lo.s32 %r168, %r2, %r8, %r6;
shl.b32 %r12, %r168, 4;
mul.lo.s32 %r169, %r109, %r8;
cvt.s64.s32 %rd52, %r169;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r170, %r11, %r109;
cvt.s64.s32 %rd6, %r170;
mul.lo.s32 %r13, %r109, %r4;
mul.lo.s32 %r14, %r10, %r166;
shl.b32 %r171, %r8, 1;
mov.u32 %r172, 1;
mad.lo.s32 %r173, %r171, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r173, 8;
add.s64 %rd7, %rd54, %rd55;
mov.u32 %r174, %tid.z;
mad.lo.s32 %r175, %r4, %r174, %r8;
mad.lo.s32 %r15, %r175, %r3, %r6;
mul.wide.u32 %rd56, %r15, 8;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r176, %r3;
mov.u32 %r177, 31;
sub.s32 %r178, %r177, %r176;
shl.b32 %r16, %r172, %r178;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r179, %r16, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r180, %r15, %r16;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r181, %r16, 31;
add.s32 %r182, %r16, %r181;
shr.s32 %r17, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r183, %r15, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r175, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r314, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
// end inline asm
add.s32 %r187, %r186, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
// end inline asm
add.s32 %r197, %r196, %r12;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r184, %r314, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r189, %r13, %r314;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r188, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r188, 0;
cp.async.ca.shared.global [%r187], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r190, %r14, %r314;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd268, 0d0000000000000000;
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r192, %r14, %r314;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
add.s32 %r194, %r14, %r314;
mad.lo.s32 %r21, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
setp.gt.s32 %p21, %r21, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
mul.lo.s32 %r195, %r21, %r118;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
setp.lt.s32 %p22, %r21, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
mul.lo.s32 %r199, %r13, %r314;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r198, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r198, 0;
cp.async.ca.shared.global [%r197], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
add.s32 %r313, %r14, %r314;
mad.lo.s32 %r312, %r313, %r4, %r8;
setp.gt.s32 %p117, %r312, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
mul.lo.s32 %r200, %r21, %r122;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd275, %fd274;
@%p23 bra $L__BB0_22;
ld.shared.v2.f64 {%fd103, %fd104}, [%rd7];
ld.shared.v2.f64 {%fd107, %fd108}, [%rd10];
ld.shared.v2.f64 {%fd111, %fd112}, [%rd12];
mul.f64 %fd115, %fd111, %fd103;
add.f64 %fd116, %fd115, 0d0000000000000000;
sub.f64 %fd117, %fd107, %fd270;
mul.f64 %fd118, %fd271, %fd117;
fma.rn.f64 %fd119, %fd115, %fd118, 0d0000000000000000;
fma.rn.f64 %fd272, %fd118, %fd103, %fd272;
mul.f64 %fd120, %fd112, %fd104;
add.f64 %fd275, %fd116, %fd120;
sub.f64 %fd121, %fd108, %fd270;
mul.f64 %fd122, %fd271, %fd121;
fma.rn.f64 %fd274, %fd120, %fd122, %fd119;
fma.rn.f64 %fd273, %fd122, %fd104, %fd273;
$L__BB0_22:
st.shared.f64 [%rd8], %fd275;
bar.sync 0;
@%p26 bra $L__BB0_24;
ld.shared.f64 %fd123, [%rd9];
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
mov.u32 %r315, %r17;
$L__BB0_26:
setp.ge.u32 %p28, %r6, %r315;
@%p28 bra $L__BB0_28;
add.s32 %r201, %r315, %r15;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
shr.u32 %r23, %r315, 1;
setp.gt.u32 %p29, %r315, 3;
mov.u32 %r315, %r23;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@%p30 bra $L__BB0_32;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f64 %fd130, [%rd8];
add.f64 %fd276, %fd130, 0d0000000000000000;
@%p31 bra $L__BB0_32;
ld.shared.f64 %fd131, [%rd11];
add.f64 %fd276, %fd276, %fd131;
$L__BB0_32:
bar.sync 0;
st.shared.f64 [%rd8], %fd274;
bar.sync 0;
@%p26 bra $L__BB0_34;
ld.shared.f64 %fd132, [%rd9];
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
setp.lt.s32 %p118, %r16, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
mov.u32 %r316, %r17;
$L__BB0_36:
setp.ge.u32 %p34, %r6, %r316;
@%p34 bra $L__BB0_38;
add.s32 %r202, %r316, %r15;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
shr.u32 %r25, %r316, 1;
setp.gt.u32 %p35, %r316, 3;
mov.u32 %r316, %r25;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f64 %fd139, [%rd8];
add.f64 %fd277, %fd139, 0d0000000000000000;
@%p37 bra $L__BB0_42;
ld.shared.f64 %fd140, [%rd11];
add.f64 %fd277, %fd277, %fd140;
$L__BB0_42:
bar.sync 0;
@%p30 bra $L__BB0_44;
st.shared.f64 [%rd13], %fd276;
$L__BB0_44:
bar.sync 0;
ld.shared.f64 %fd33, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_46;
st.shared.f64 [%rd13], %fd277;
$L__BB0_46:
bar.sync 0;
ld.shared.f64 %fd34, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_48;
mul.f64 %fd141, %fd2, %fd271;
ld.shared.v2.f64 {%fd142, %fd143}, [%rd10];
ld.shared.v2.f64 {%fd146, %fd147}, [%rd12];
ld.shared.v2.f64 {%fd150, %fd151}, [%rd7];
mul.f64 %fd154, %fd146, %fd150;
mul.f64 %fd155, %fd154, %fd1;
sub.f64 %fd156, %fd142, %fd270;
mul.f64 %fd157, %fd271, %fd156;
sub.f64 %fd158, %fd155, %fd33;
mul.f64 %fd159, %fd34, %fd157;
sub.f64 %fd160, %fd158, %fd159;
mul.f64 %fd161, %fd141, %fd160;
mov.b64 %rd86, %fd161;
mul.f64 %fd162, %fd147, %fd151;
mul.f64 %fd163, %fd162, %fd1;
sub.f64 %fd164, %fd143, %fd270;
mul.f64 %fd165, %fd271, %fd164;
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
mad.lo.s32 %r207, %r21, %r109, %r7;
mul.wide.s32 %rd88, %r207, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
// end inline asm
$L__BB0_48:
add.s32 %r314, %r314, 1;
setp.lt.s32 %p41, %r314, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
mov.u32 %r208, %tid.z;
mad.lo.s32 %r209, %r4, %r208, %r8;
mad.lo.s32 %r27, %r209, %r3, %r6;
mul.wide.u32 %rd89, %r27, 8;
add.s64 %rd23, %rd43, %rd89;
clz.b32 %r210, %r4;
mov.u32 %r211, 31;
sub.s32 %r212, %r211, %r210;
mov.u32 %r213, 1;
shl.b32 %r28, %r213, %r212;
setp.lt.u32 %p42, %r8, %r28;
add.s32 %r214, %r28, %r8;
setp.lt.u32 %p43, %r214, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r215, %r3, %r212;
add.s32 %r216, %r27, %r215;
mul.wide.s32 %rd91, %r216, 8;
add.s64 %rd24, %rd43, %rd91;
shr.u32 %r217, %r28, 31;
add.s32 %r218, %r28, %r217;
shr.s32 %r320, %r218, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
ld.shared.f64 %fd170, [%rd24];
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
setp.lt.s32 %p45, %r28, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
mov.u32 %r317, %r320;
$L__BB0_53:
setp.ge.u32 %p46, %r8, %r317;
@%p46 bra $L__BB0_55;
mad.lo.s32 %r219, %r317, %r3, %r27;
mul.wide.s32 %rd92, %r219, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
shr.u32 %r31, %r317, 1;
setp.gt.u32 %p47, %r317, 3;
mov.u32 %r317, %r31;
@%p47 bra $L__BB0_53;
$L__BB0_56:
add.s32 %r220, %r27, %r3;
mul.wide.u32 %rd95, %r220, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f64 %fd177, [%rd23];
add.f64 %fd282, %fd177, 0d0000000000000000;
@%p49 bra $L__BB0_59;
ld.shared.f64 %fd178, [%rd25];
add.f64 %fd282, %fd282, %fd178;
$L__BB0_59:
bar.sync 0;
st.shared.f64 [%rd23], %fd273;
bar.sync 0;
@%p44 bra $L__BB0_61;
ld.shared.f64 %fd179, [%rd24];
ld.shared.f64 %fd180, [%rd23];
add.f64 %fd181, %fd179, %fd180;
st.shared.f64 [%rd23], %fd181;
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
mov.u32 %r318, %r320;
$L__BB0_63:
setp.ge.u32 %p52, %r8, %r318;
@%p52 bra $L__BB0_65;
mad.lo.s32 %r221, %r318, %r3, %r27;
mul.wide.s32 %rd97, %r221, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
shr.u32 %r33, %r318, 1;
setp.gt.u32 %p53, %r318, 3;
mov.u32 %r318, %r33;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f64 %fd186, [%rd23];
add.f64 %fd283, %fd186, 0d0000000000000000;
@%p55 bra $L__BB0_69;
ld.shared.f64 %fd187, [%rd25];
add.f64 %fd283, %fd283, %fd187;
$L__BB0_69:
bar.sync 0;
st.shared.f64 [%rd23], %fd280;
bar.sync 0;
@%p44 bra $L__BB0_71;
ld.shared.f64 %fd188, [%rd24];
ld.shared.f64 %fd189, [%rd23];
add.f64 %fd190, %fd188, %fd189;
st.shared.f64 [%rd23], %fd190;
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
mov.u32 %r319, %r320;
$L__BB0_73:
setp.ge.u32 %p58, %r8, %r319;
@%p58 bra $L__BB0_75;
mad.lo.s32 %r222, %r319, %r3, %r27;
mul.wide.s32 %rd100, %r222, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
shr.u32 %r35, %r319, 1;
setp.gt.u32 %p59, %r319, 3;
mov.u32 %r319, %r35;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f64 %fd195, [%rd23];
add.f64 %fd284, %fd195, 0d0000000000000000;
@%p61 bra $L__BB0_79;
ld.shared.f64 %fd196, [%rd25];
add.f64 %fd284, %fd284, %fd196;
$L__BB0_79:
bar.sync 0;
st.shared.f64 [%rd23], %fd281;
bar.sync 0;
@%p44 bra $L__BB0_81;
ld.shared.f64 %fd197, [%rd24];
ld.shared.f64 %fd198, [%rd23];
add.f64 %fd199, %fd197, %fd198;
st.shared.f64 [%rd23], %fd199;
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
setp.ge.u32 %p64, %r8, %r320;
@%p64 bra $L__BB0_84;
mad.lo.s32 %r223, %r320, %r3, %r27;
mul.wide.s32 %rd103, %r223, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
shr.u32 %r37, %r320, 1;
setp.gt.u32 %p65, %r320, 3;
mov.u32 %r320, %r37;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f64 %fd204, [%rd23];
add.f64 %fd285, %fd204, 0d0000000000000000;
@%p67 bra $L__BB0_88;
ld.shared.f64 %fd205, [%rd25];
add.f64 %fd285, %fd285, %fd205;
$L__BB0_88:
setp.eq.s32 %p116, %r8, 0;
and.pred %p115, %p116, %p1;
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
mov.u32 %r232, %ctaid.y;
mad.lo.s32 %r233, %r109, %r232, %r7;
mul.wide.s32 %rd108, %r233, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
mov.b64 {%r224, %r225}, %rd109;
mov.b64 %rd110, %fd283;
mov.b64 {%r226, %r227}, %rd110;
// begin inline asm
st.volatile.global.v4.s32 [%rd106], {%r224,%r225,%r226,%r227};
// end inline asm
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
mov.b64 {%r228, %r229}, %rd111;
mov.b64 %rd112, %fd285;
mov.b64 {%r230, %r231}, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd107], {%r228,%r229,%r230,%r231};
// end inline asm
$L__BB0_90:
mov.u32 %r38, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r234, %r6, %r8;
or.b32 %r236, %r234, %r208;
setp.ne.s32 %p68, %r236, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
mov.u32 %r237, %ctaid.x;
mov.u32 %r238, %ctaid.z;
mov.u32 %r239, %nctaid.x;
mad.lo.s32 %r240, %r238, %r239, %r237;
mul.wide.s32 %rd114, %r240, 8;
add.s64 %rd28, %rd113, %rd114;
add.s32 %r241, %r9, -1;
setp.eq.s32 %p69, %r38, %r241;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
mov.u32 %r321, 8;
$L__BB0_93:
// begin inline asm
nanosleep.u32 %r321;
// end inline asm
setp.lt.u32 %p71, %r321, 256;
selp.u32 %r244, 1, 0, %p71;
shl.b32 %r321, %r321, %r244;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
add.s32 %r245, %r9, %r3;
add.s32 %r246, %r245, -1;
div.s32 %r41, %r246, %r3;
setp.lt.s32 %p73, %r41, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
add.s32 %r248, %r4, %r2;
add.s32 %r249, %r248, -1;
shl.b32 %r250, %r8, 1;
shl.b32 %r251, %r4, 1;
mad.lo.s32 %r252, %r251, %r38, %r250;
or.b32 %r253, %r252, 1;
setp.ge.s32 %p74, %r253, %r109;
div.s32 %r254, %r249, %r4;
setp.ge.s32 %p75, %r38, %r254;
or.pred %p6, %p75, %p74;
mul.lo.s32 %r255, %r4, %r38;
shl.b32 %r256, %r255, 1;
mad.lo.s32 %r257, %r109, %r6, %r256;
add.s32 %r323, %r257, %r250;
mul.lo.s32 %r43, %r109, %r3;
mov.u32 %r324, 0;
mov.f64 %fd209, 0d0000000000000000;
mov.u32 %r322, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
setp.ge.s32 %p76, %r322, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
mul.wide.s32 %rd124, %r323, 8;
add.s64 %rd123, %rd41, %rd124;
// begin inline asm
ld.volatile.global.v4.s32 {%r258,%r259,%r260,%r261}, [%rd123];
// end inline asm
mov.b64 %rd125, {%r258, %r259};
mov.b64 %fd289, %rd125;
mov.b64 %rd126, {%r260, %r261};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
add.s32 %r323, %r323, %r43;
add.s32 %r322, %r322, %r3;
add.s32 %r324, %r324, 1;
setp.lt.s32 %p77, %r324, %r41;
@%p77 bra $L__BB0_96;
$L__BB0_100:
clz.b32 %r262, %r3;
mov.u32 %r263, 31;
sub.s32 %r264, %r263, %r262;
mov.u32 %r265, 1;
shl.b32 %r50, %r265, %r264;
setp.lt.u32 %p78, %r6, %r50;
add.s32 %r266, %r50, %r6;
setp.lt.u32 %p79, %r266, %r3;
and.pred %p7, %p78, %p79;
add.s32 %r267, %r27, %r50;
mul.wide.s32 %rd127, %r267, 8;
add.s64 %rd30, %rd43, %rd127;
shr.u32 %r268, %r50, 31;
add.s32 %r269, %r50, %r268;
shr.s32 %r331, %r269, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
ld.shared.f64 %fd214, [%rd30];
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
setp.lt.s32 %p81, %r50, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
mov.u32 %r325, %r331;
$L__BB0_104:
setp.ge.u32 %p82, %r6, %r325;
@%p82 bra $L__BB0_106;
add.s32 %r270, %r325, %r27;
mul.wide.s32 %rd129, %r270, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
shr.u32 %r53, %r325, 1;
setp.gt.u32 %p83, %r325, 3;
mov.u32 %r325, %r53;
@%p83 bra $L__BB0_104;
$L__BB0_107:
add.s32 %r271, %r27, 1;
mul.wide.u32 %rd132, %r271, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f64 %fd221, [%rd23];
add.f64 %fd292, %fd221, 0d0000000000000000;
@%p85 bra $L__BB0_110;
ld.shared.f64 %fd222, [%rd31];
add.f64 %fd292, %fd292, %fd222;
$L__BB0_110:
bar.sync 0;
st.shared.f64 [%rd23], %fd291;
bar.sync 0;
@%p80 bra $L__BB0_112;
ld.shared.f64 %fd223, [%rd30];
ld.shared.f64 %fd224, [%rd23];
add.f64 %fd225, %fd223, %fd224;
st.shared.f64 [%rd23], %fd225;
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
mov.u32 %r326, %r331;
$L__BB0_114:
setp.ge.u32 %p88, %r6, %r326;
@%p88 bra $L__BB0_116;
add.s32 %r272, %r326, %r27;
mul.wide.s32 %rd134, %r272, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
shr.u32 %r55, %r326, 1;
setp.gt.u32 %p89, %r326, 3;
mov.u32 %r326, %r55;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f64 %fd230, [%rd23];
add.f64 %fd293, %fd230, 0d0000000000000000;
@%p91 bra $L__BB0_120;
ld.shared.f64 %fd231, [%rd31];
add.f64 %fd293, %fd293, %fd231;
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
add.s32 %r273, %r4, %r2;
add.s32 %r274, %r273, -1;
div.s32 %r275, %r274, %r4;
setp.ge.s32 %p93, %r38, %r275;
@%p93 bra $L__BB0_124;
shl.b32 %r56, %r8, 1;
mul.lo.s32 %r276, %r4, %r38;
shl.b32 %r57, %r276, 1;
add.s32 %r277, %r56, %r57;
or.b32 %r278, %r277, 1;
setp.ge.s32 %p94, %r278, %r109;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r283, %r57, %r56;
mul.wide.s32 %rd138, %r283, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
mov.b64 {%r279, %r280}, %rd139;
mov.b64 %rd140, %fd293;
mov.b64 {%r281, %r282}, %rd140;
// begin inline asm
st.global.cs.v4.s32 [%rd137], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
add.s32 %r285, %r4, %r2;
add.s32 %r286, %r285, -1;
shl.b32 %r287, %r8, 1;
shl.b32 %r288, %r4, 1;
mad.lo.s32 %r289, %r288, %r38, %r287;
or.b32 %r290, %r289, 1;
setp.ge.s32 %p96, %r290, %r109;
div.s32 %r291, %r286, %r4;
setp.ge.s32 %p97, %r38, %r291;
or.pred %p8, %p97, %p96;
mul.lo.s32 %r292, %r4, %r38;
shl.b32 %r293, %r292, 1;
mad.lo.s32 %r294, %r109, %r6, %r293;
add.s32 %r328, %r294, %r287;
mul.lo.s32 %r59, %r109, %r3;
mov.u32 %r329, 0;
mov.f64 %fd235, 0d0000000000000000;
mov.u32 %r327, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
setp.ge.s32 %p98, %r327, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
mul.wide.s32 %rd142, %r328, 8;
add.s64 %rd141, %rd40, %rd142;
// begin inline asm
ld.volatile.global.v4.s32 {%r295,%r296,%r297,%r298}, [%rd141];
// end inline asm
mov.b64 %rd143, {%r295, %r296};
mov.b64 %fd297, %rd143;
mov.b64 %rd144, {%r297, %r298};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
add.s32 %r328, %r328, %r59;
add.s32 %r327, %r327, %r3;
add.s32 %r329, %r329, 1;
setp.lt.s32 %p99, %r329, %r41;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@%p80 bra $L__BB0_132;
ld.shared.f64 %fd240, [%rd30];
ld.shared.f64 %fd241, [%rd23];
add.f64 %fd242, %fd240, %fd241;
st.shared.f64 [%rd23], %fd242;
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
mov.u32 %r330, %r331;
$L__BB0_134:
setp.ge.u32 %p102, %r6, %r330;
@%p102 bra $L__BB0_136;
add.s32 %r299, %r330, %r27;
mul.wide.s32 %rd145, %r299, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
shr.u32 %r67, %r330, 1;
setp.gt.u32 %p103, %r330, 3;
mov.u32 %r330, %r67;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
setp.lt.u32 %p105, %r3, 2;
ld.shared.f64 %fd247, [%rd23];
add.f64 %fd300, %fd247, 0d0000000000000000;
@%p105 bra $L__BB0_140;
ld.shared.f64 %fd248, [%rd31];
add.f64 %fd300, %fd300, %fd248;
$L__BB0_140:
bar.sync 0;
st.shared.f64 [%rd23], %fd299;
bar.sync 0;
@%p80 bra $L__BB0_142;
ld.shared.f64 %fd249, [%rd30];
ld.shared.f64 %fd250, [%rd23];
add.f64 %fd251, %fd249, %fd250;
st.shared.f64 [%rd23], %fd251;
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
setp.ge.u32 %p108, %r6, %r331;
@%p108 bra $L__BB0_145;
add.s32 %r300, %r331, %r27;
mul.wide.s32 %rd148, %r300, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
shr.u32 %r69, %r331, 1;
setp.gt.u32 %p109, %r331, 3;
mov.u32 %r331, %r69;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
setp.lt.u32 %p111, %r3, 2;
ld.shared.f64 %fd256, [%rd23];
add.f64 %fd301, %fd256, 0d0000000000000000;
@%p111 bra $L__BB0_149;
ld.shared.f64 %fd257, [%rd31];
add.f64 %fd301, %fd301, %fd257;
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
add.s32 %r301, %r4, %r2;
add.s32 %r302, %r301, -1;
div.s32 %r303, %r302, %r4;
setp.ge.s32 %p113, %r38, %r303;
@%p113 bra $L__BB0_153;
shl.b32 %r70, %r8, 1;
mul.lo.s32 %r304, %r4, %r38;
shl.b32 %r71, %r304, 1;
add.s32 %r305, %r70, %r71;
or.b32 %r306, %r305, 1;
setp.ge.s32 %p114, %r306, %r109;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_3_cu_840dfd4b_72339nvfuser_3ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r311, %r71, %r70;
mul.wide.s32 %rd152, %r311, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
mov.b64 {%r307, %r308}, %rd153;
mov.b64 %rd154, %fd301;
mov.b64 {%r309, %r310}, %rd154;
// begin inline asm
st.global.cs.v4.s32 [%rd151], {%r307,%r308,%r309,%r310};
// end inline asm
$L__BB0_153:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -29,175 +29,175 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<119>;
- .reg .b32 %r<334>;
+ .reg .b32 %r<332>;
.reg .f64 %fd<302>;
.reg .b64 %rd<158>;
- ld.param.v2.u32 {%r109, %r110}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r119, %r120}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r123, %r124}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r108, %r109}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r118, %r119}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r122, %r123}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r145, %r110, 1;
- shr.u32 %r146, %r145, 31;
- add.s32 %r147, %r145, %r146;
- shr.s32 %r2, %r147, 1;
+ add.s32 %r144, %r109, 1;
+ shr.u32 %r145, %r144, 31;
+ add.s32 %r146, %r144, %r145;
+ shr.s32 %r2, %r146, 1;
mov.u32 %r3, %ntid.x;
- max.s32 %r148, %r2, %r3;
+ max.s32 %r147, %r2, %r3;
mov.u32 %r4, %ntid.y;
- shl.b32 %r149, %r4, 3;
- mad.lo.s32 %r150, %r149, %r148, 15;
- and.b32 %r151, %r150, -16;
- cvt.u64.u32 %rd1, %r151;
- mul.lo.s32 %r152, %r4, %r2;
- shl.b32 %r153, %r152, 4;
- or.b32 %r154, %r153, 15;
- and.b32 %r5, %r154, -16;
- add.s32 %r155, %r154, %r5;
- and.b32 %r156, %r155, -16;
- cvt.s64.s32 %rd2, %r156;
+ shl.b32 %r148, %r4, 3;
+ mad.lo.s32 %r149, %r148, %r147, 15;
+ and.b32 %r150, %r149, -16;
+ cvt.u64.u32 %rd1, %r150;
+ mul.lo.s32 %r151, %r4, %r2;
+ shl.b32 %r152, %r151, 4;
+ or.b32 %r153, %r152, 15;
+ and.b32 %r5, %r153, -16;
+ add.s32 %r154, %r153, %r5;
+ and.b32 %r155, %r154, -16;
+ cvt.s64.s32 %rd2, %r155;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
- cvt.rn.f64.s32 %fd1, %r110;
+ cvt.rn.f64.s32 %fd1, %r109;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 1;
- or.b32 %r157, %r7, 1;
- setp.lt.s32 %p10, %r157, %r110;
+ or.b32 %r156, %r7, 1;
+ setp.lt.s32 %p10, %r156, %r109;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r158, smem_ptr; }
-
-
- shl.b32 %r161, %r6, 4;
- add.s32 %r159, %r158, %r161;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r157, smem_ptr; }
+
+
+ shl.b32 %r160, %r6, 4;
+ add.s32 %r158, %r157, %r160;
mul.wide.s32 %rd47, %r7, 8;
add.s64 %rd46, %rd36, %rd47;
- mov.u32 %r160, 0;
+ mov.u32 %r159, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r160, 0;
- cp.async.ca.shared.global [%r159], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r159, 0;
+ cp.async.ca.shared.global [%r158], [%rd46], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r162, %r4, 215;
- div.s32 %r163, %r162, %r4;
+ add.s32 %r161, %r4, 215;
+ div.s32 %r162, %r161, %r4;
mov.u32 %r9, %nctaid.y;
- add.s32 %r164, %r9, %r163;
- add.s32 %r165, %r164, -1;
- div.s32 %r10, %r165, %r9;
+ add.s32 %r163, %r9, %r162;
+ add.s32 %r164, %r163, -1;
+ div.s32 %r10, %r164, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
- mov.u32 %r167, %ctaid.y;
- mul.lo.s32 %r168, %r10, %r4;
- mul.lo.s32 %r11, %r168, %r167;
- shl.b32 %r169, %r8, 3;
- shl.b32 %r170, %r6, 4;
- mad.lo.s32 %r12, %r169, %r110, %r170;
- mul.lo.s32 %r171, %r110, %r8;
- cvt.s64.s32 %rd52, %r171;
+ mov.u32 %r166, %ctaid.y;
+ mul.lo.s32 %r167, %r10, %r4;
+ mul.lo.s32 %r11, %r167, %r166;
+ mad.lo.s32 %r168, %r2, %r8, %r6;
+ shl.b32 %r12, %r168, 4;
+ mul.lo.s32 %r169, %r109, %r8;
+ cvt.s64.s32 %rd52, %r169;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r172, %r11, %r110;
- cvt.s64.s32 %rd6, %r172;
- mul.lo.s32 %r13, %r110, %r4;
- mul.lo.s32 %r14, %r10, %r167;
- add.s32 %r15, %r171, %r7;
+ mul.lo.s32 %r170, %r11, %r109;
+ cvt.s64.s32 %rd6, %r170;
+ mul.lo.s32 %r13, %r109, %r4;
+ mul.lo.s32 %r14, %r10, %r166;
+ shl.b32 %r171, %r8, 1;
+ mov.u32 %r172, 1;
+ mad.lo.s32 %r173, %r171, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
- mul.wide.s32 %rd55, %r15, 8;
+ mul.wide.s32 %rd55, %r173, 8;
add.s64 %rd7, %rd54, %rd55;
- mov.u32 %r173, %tid.z;
- mad.lo.s32 %r174, %r4, %r173, %r8;
- mad.lo.s32 %r16, %r174, %r3, %r6;
- mul.wide.u32 %rd56, %r16, 8;
+ mov.u32 %r174, %tid.z;
+ mad.lo.s32 %r175, %r4, %r174, %r8;
+ mad.lo.s32 %r15, %r175, %r3, %r6;
+ mul.wide.u32 %rd56, %r15, 8;
add.s64 %rd8, %rd43, %rd56;
- clz.b32 %r175, %r3;
- mov.u32 %r176, 31;
- sub.s32 %r177, %r176, %r175;
- mov.u32 %r178, 1;
- shl.b32 %r17, %r178, %r177;
- setp.lt.u32 %p14, %r6, %r17;
- add.s32 %r179, %r17, %r6;
+ clz.b32 %r176, %r3;
+ mov.u32 %r177, 31;
+ sub.s32 %r178, %r177, %r176;
+ shl.b32 %r16, %r172, %r178;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r179, %r16, %r6;
setp.lt.u32 %p15, %r179, %r3;
and.pred %p3, %p14, %p15;
- add.s32 %r180, %r16, %r17;
+ add.s32 %r180, %r15, %r16;
mul.wide.s32 %rd57, %r180, 8;
add.s64 %rd9, %rd43, %rd57;
- shr.u32 %r181, %r17, 31;
- add.s32 %r182, %r17, %r181;
- shr.s32 %r18, %r182, 1;
+ shr.u32 %r181, %r16, 31;
+ add.s32 %r182, %r16, %r181;
+ shr.s32 %r17, %r182, 1;
add.s64 %rd10, %rd51, %rd55;
- add.s32 %r183, %r16, 1;
+ add.s32 %r183, %r15, 1;
mul.wide.u32 %rd58, %r183, 8;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd12, %rd59, %rd60;
- mul.wide.s32 %rd61, %r174, 8;
+ mul.wide.s32 %rd61, %r175, 8;
add.s64 %rd13, %rd43, %rd61;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
rcp.rn.f64 %fd2, %fd1;
- mov.u32 %r316, 0;
+ mov.u32 %r314, 0;
mov.f64 %fd272, 0d0000000000000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r186, smem_ptr; }
- add.s32 %r187, %r12, %r186;
+ add.s32 %r187, %r186, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r196, smem_ptr; }
- add.s32 %r197, %r12, %r196;
+ add.s32 %r197, %r196, %r12;
not.pred %p26, %p3;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r184, %r316, %r4, %r8;
+ mad.lo.s32 %r184, %r314, %r4, %r8;
add.s32 %r185, %r184, %r11;
setp.gt.s32 %p17, %r185, 215;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r189, %r13, %r316;
+ mul.lo.s32 %r189, %r13, %r314;
cvt.s64.s32 %rd65, %r189;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd32, %rd68;
@@ -216,11 +216,11 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r190, %r14, %r316;
+ add.s32 %r190, %r14, %r314;
mad.lo.s32 %r191, %r190, %r4, %r8;
setp.lt.s32 %p19, %r191, 216;
@%p19 bra $L__BB0_13;
bra.uni $L__BB0_10;
@@ -233,38 +233,38 @@
mov.f64 %fd269, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
- add.s32 %r192, %r14, %r316;
+ add.s32 %r192, %r14, %r314;
mad.lo.s32 %r193, %r192, %r4, %r8;
setp.gt.s32 %p20, %r193, 215;
@%p20 bra $L__BB0_14;
ld.shared.v2.f64 {%fd268, %fd269}, [%rd7];
$L__BB0_14:
- add.s32 %r194, %r14, %r316;
- mad.lo.s32 %r22, %r194, %r4, %r8;
+ add.s32 %r194, %r14, %r314;
+ mad.lo.s32 %r21, %r194, %r4, %r8;
add.f64 %fd281, %fd281, %fd269;
add.f64 %fd280, %fd280, %fd268;
- setp.gt.s32 %p21, %r22, 215;
+ setp.gt.s32 %p21, %r21, 215;
mov.f64 %fd270, 0d0000000000000000;
@%p21 bra $L__BB0_16;
- mul.lo.s32 %r195, %r22, %r119;
+ mul.lo.s32 %r195, %r21, %r118;
mul.wide.s32 %rd69, %r195, 8;
add.s64 %rd70, %rd16, %rd69;
ld.global.f64 %fd270, [%rd70];
$L__BB0_16:
- setp.lt.s32 %p22, %r22, 216;
+ setp.lt.s32 %p22, %r21, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_18;
- mul.lo.s32 %r199, %r13, %r316;
+ mul.lo.s32 %r199, %r13, %r314;
cvt.s64.s32 %rd73, %r199;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd33, %rd76;
@@ -277,18 +277,18 @@
}
$L__BB0_18:
- add.s32 %r315, %r14, %r316;
- mad.lo.s32 %r314, %r315, %r4, %r8;
- setp.gt.s32 %p117, %r314, 215;
+ add.s32 %r313, %r14, %r314;
+ mad.lo.s32 %r312, %r313, %r4, %r8;
+ setp.gt.s32 %p117, %r312, 215;
mov.f64 %fd274, 0d0000000000000000;
mov.f64 %fd271, %fd274;
@%p117 bra $L__BB0_20;
- mul.lo.s32 %r200, %r22, %r123;
+ mul.lo.s32 %r200, %r21, %r122;
mul.wide.s32 %rd77, %r200, 8;
add.s64 %rd78, %rd17, %rd77;
ld.global.f64 %fd271, [%rd78];
$L__BB0_20:
@@ -324,33 +324,33 @@
ld.shared.f64 %fd124, [%rd8];
add.f64 %fd125, %fd123, %fd124;
st.shared.f64 [%rd8], %fd125;
$L__BB0_24:
- setp.lt.s32 %p27, %r17, 4;
+ setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_29;
- mov.u32 %r317, %r18;
+ mov.u32 %r315, %r17;
$L__BB0_26:
- setp.ge.u32 %p28, %r6, %r317;
+ setp.ge.u32 %p28, %r6, %r315;
@%p28 bra $L__BB0_28;
- add.s32 %r201, %r317, %r16;
+ add.s32 %r201, %r315, %r15;
mul.wide.s32 %rd79, %r201, 8;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f64 %fd126, [%rd8];
ld.shared.f64 %fd127, [%rd81];
add.f64 %fd128, %fd127, %fd126;
st.shared.f64 [%rd8], %fd128;
$L__BB0_28:
bar.sync 0;
- shr.u32 %r24, %r317, 1;
- setp.gt.u32 %p29, %r317, 3;
- mov.u32 %r317, %r24;
+ shr.u32 %r23, %r315, 1;
+ setp.gt.u32 %p29, %r315, 3;
+ mov.u32 %r315, %r23;
@%p29 bra $L__BB0_26;
$L__BB0_29:
setp.ne.s32 %p30, %r6, 0;
mov.f64 %fd276, 0d0000000000000000;
@@ -374,33 +374,33 @@
ld.shared.f64 %fd133, [%rd8];
add.f64 %fd134, %fd132, %fd133;
st.shared.f64 [%rd8], %fd134;
$L__BB0_34:
- setp.lt.s32 %p118, %r17, 4;
+ setp.lt.s32 %p118, %r16, 4;
bar.sync 0;
@%p118 bra $L__BB0_39;
- mov.u32 %r318, %r18;
+ mov.u32 %r316, %r17;
$L__BB0_36:
- setp.ge.u32 %p34, %r6, %r318;
+ setp.ge.u32 %p34, %r6, %r316;
@%p34 bra $L__BB0_38;
- add.s32 %r202, %r318, %r16;
+ add.s32 %r202, %r316, %r15;
mul.wide.s32 %rd82, %r202, 8;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f64 %fd135, [%rd8];
ld.shared.f64 %fd136, [%rd84];
add.f64 %fd137, %fd136, %fd135;
st.shared.f64 [%rd8], %fd137;
$L__BB0_38:
bar.sync 0;
- shr.u32 %r26, %r318, 1;
- setp.gt.u32 %p35, %r318, 3;
- mov.u32 %r318, %r26;
+ shr.u32 %r25, %r316, 1;
+ setp.gt.u32 %p35, %r316, 3;
+ mov.u32 %r316, %r25;
@%p35 bra $L__BB0_36;
$L__BB0_39:
mov.f64 %fd277, 0d0000000000000000;
@%p30 bra $L__BB0_42;
@@ -453,54 +453,53 @@
sub.f64 %fd166, %fd163, %fd33;
mul.f64 %fd167, %fd34, %fd165;
sub.f64 %fd168, %fd166, %fd167;
mul.f64 %fd169, %fd141, %fd168;
mov.b64 %rd87, %fd169;
- mad.lo.s32 %r207, %r316, %r4, %r11;
- mad.lo.s32 %r208, %r207, %r110, %r15;
- mul.wide.s32 %rd88, %r208, 8;
+ mad.lo.s32 %r207, %r21, %r109, %r7;
+ mul.wide.s32 %rd88, %r207, 8;
add.s64 %rd85, %rd37, %rd88;
mov.b64 {%r203, %r204}, %rd86;
mov.b64 {%r205, %r206}, %rd87;
st.global.cs.v4.s32 [%rd85], {%r203,%r204,%r205,%r206};
$L__BB0_48:
- add.s32 %r316, %r316, 1;
- setp.lt.s32 %p41, %r316, %r10;
+ add.s32 %r314, %r314, 1;
+ setp.lt.s32 %p41, %r314, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_49;
$L__BB0_3:
mov.f64 %fd272, 0d0000000000000000;
mov.f64 %fd273, %fd272;
mov.f64 %fd280, %fd272;
mov.f64 %fd281, %fd272;
$L__BB0_49:
- mov.u32 %r209, %tid.z;
- mad.lo.s32 %r210, %r4, %r209, %r8;
- mad.lo.s32 %r28, %r210, %r3, %r6;
- mul.wide.u32 %rd89, %r28, 8;
+ mov.u32 %r208, %tid.z;
+ mad.lo.s32 %r209, %r4, %r208, %r8;
+ mad.lo.s32 %r27, %r209, %r3, %r6;
+ mul.wide.u32 %rd89, %r27, 8;
add.s64 %rd23, %rd43, %rd89;
- clz.b32 %r211, %r4;
- mov.u32 %r212, 31;
- sub.s32 %r213, %r212, %r211;
- mov.u32 %r214, 1;
- shl.b32 %r29, %r214, %r213;
- setp.lt.u32 %p42, %r8, %r29;
- add.s32 %r215, %r29, %r8;
- setp.lt.u32 %p43, %r215, %r4;
+ clz.b32 %r210, %r4;
+ mov.u32 %r211, 31;
+ sub.s32 %r212, %r211, %r210;
+ mov.u32 %r213, 1;
+ shl.b32 %r28, %r213, %r212;
+ setp.lt.u32 %p42, %r8, %r28;
+ add.s32 %r214, %r28, %r8;
+ setp.lt.u32 %p43, %r214, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r216, %r3, %r213;
- add.s32 %r217, %r28, %r216;
- mul.wide.s32 %rd91, %r217, 8;
+ shl.b32 %r215, %r3, %r212;
+ add.s32 %r216, %r27, %r215;
+ mul.wide.s32 %rd91, %r216, 8;
add.s64 %rd24, %rd43, %rd91;
- shr.u32 %r218, %r29, 31;
- add.s32 %r219, %r29, %r218;
- shr.s32 %r322, %r219, 1;
+ shr.u32 %r217, %r28, 31;
+ add.s32 %r218, %r28, %r217;
+ shr.s32 %r320, %r218, 1;
st.shared.f64 [%rd23], %fd272;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_51;
@@ -508,38 +507,38 @@
ld.shared.f64 %fd171, [%rd23];
add.f64 %fd172, %fd170, %fd171;
st.shared.f64 [%rd23], %fd172;
$L__BB0_51:
- setp.lt.s32 %p45, %r29, 4;
+ setp.lt.s32 %p45, %r28, 4;
bar.sync 0;
@%p45 bra $L__BB0_56;
- mov.u32 %r319, %r322;
+ mov.u32 %r317, %r320;
$L__BB0_53:
- setp.ge.u32 %p46, %r8, %r319;
+ setp.ge.u32 %p46, %r8, %r317;
@%p46 bra $L__BB0_55;
- mad.lo.s32 %r220, %r319, %r3, %r28;
- mul.wide.s32 %rd92, %r220, 8;
+ mad.lo.s32 %r219, %r317, %r3, %r27;
+ mul.wide.s32 %rd92, %r219, 8;
add.s64 %rd94, %rd43, %rd92;
ld.shared.f64 %fd173, [%rd23];
ld.shared.f64 %fd174, [%rd94];
add.f64 %fd175, %fd174, %fd173;
st.shared.f64 [%rd23], %fd175;
$L__BB0_55:
bar.sync 0;
- shr.u32 %r32, %r319, 1;
- setp.gt.u32 %p47, %r319, 3;
- mov.u32 %r319, %r32;
+ shr.u32 %r31, %r317, 1;
+ setp.gt.u32 %p47, %r317, 3;
+ mov.u32 %r317, %r31;
@%p47 bra $L__BB0_53;
$L__BB0_56:
- add.s32 %r221, %r28, %r3;
- mul.wide.u32 %rd95, %r221, 8;
+ add.s32 %r220, %r27, %r3;
+ mul.wide.u32 %rd95, %r220, 8;
add.s64 %rd25, %rd43, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.f64 %fd282, 0d0000000000000000;
@%p48 bra $L__BB0_59;
@@ -564,29 +563,29 @@
$L__BB0_61:
bar.sync 0;
@%p45 bra $L__BB0_66;
- mov.u32 %r320, %r322;
+ mov.u32 %r318, %r320;
$L__BB0_63:
- setp.ge.u32 %p52, %r8, %r320;
+ setp.ge.u32 %p52, %r8, %r318;
@%p52 bra $L__BB0_65;
- mad.lo.s32 %r222, %r320, %r3, %r28;
- mul.wide.s32 %rd97, %r222, 8;
+ mad.lo.s32 %r221, %r318, %r3, %r27;
+ mul.wide.s32 %rd97, %r221, 8;
add.s64 %rd99, %rd43, %rd97;
ld.shared.f64 %fd182, [%rd23];
ld.shared.f64 %fd183, [%rd99];
add.f64 %fd184, %fd183, %fd182;
st.shared.f64 [%rd23], %fd184;
$L__BB0_65:
bar.sync 0;
- shr.u32 %r34, %r320, 1;
- setp.gt.u32 %p53, %r320, 3;
- mov.u32 %r320, %r34;
+ shr.u32 %r33, %r318, 1;
+ setp.gt.u32 %p53, %r318, 3;
+ mov.u32 %r318, %r33;
@%p53 bra $L__BB0_63;
$L__BB0_66:
mov.f64 %fd283, 0d0000000000000000;
@%p48 bra $L__BB0_69;
@@ -612,29 +611,29 @@
$L__BB0_71:
bar.sync 0;
@%p45 bra $L__BB0_76;
- mov.u32 %r321, %r322;
+ mov.u32 %r319, %r320;
$L__BB0_73:
- setp.ge.u32 %p58, %r8, %r321;
+ setp.ge.u32 %p58, %r8, %r319;
@%p58 bra $L__BB0_75;
- mad.lo.s32 %r223, %r321, %r3, %r28;
- mul.wide.s32 %rd100, %r223, 8;
+ mad.lo.s32 %r222, %r319, %r3, %r27;
+ mul.wide.s32 %rd100, %r222, 8;
add.s64 %rd102, %rd43, %rd100;
ld.shared.f64 %fd191, [%rd23];
ld.shared.f64 %fd192, [%rd102];
add.f64 %fd193, %fd192, %fd191;
st.shared.f64 [%rd23], %fd193;
$L__BB0_75:
bar.sync 0;
- shr.u32 %r36, %r321, 1;
- setp.gt.u32 %p59, %r321, 3;
- mov.u32 %r321, %r36;
+ shr.u32 %r35, %r319, 1;
+ setp.gt.u32 %p59, %r319, 3;
+ mov.u32 %r319, %r35;
@%p59 bra $L__BB0_73;
$L__BB0_76:
mov.f64 %fd284, 0d0000000000000000;
@%p48 bra $L__BB0_79;
@@ -661,26 +660,26 @@
$L__BB0_81:
bar.sync 0;
@%p45 bra $L__BB0_85;
$L__BB0_82:
- setp.ge.u32 %p64, %r8, %r322;
+ setp.ge.u32 %p64, %r8, %r320;
@%p64 bra $L__BB0_84;
- mad.lo.s32 %r224, %r322, %r3, %r28;
- mul.wide.s32 %rd103, %r224, 8;
+ mad.lo.s32 %r223, %r320, %r3, %r27;
+ mul.wide.s32 %rd103, %r223, 8;
add.s64 %rd105, %rd43, %rd103;
ld.shared.f64 %fd200, [%rd23];
ld.shared.f64 %fd201, [%rd105];
add.f64 %fd202, %fd201, %fd200;
st.shared.f64 [%rd23], %fd202;
$L__BB0_84:
bar.sync 0;
- shr.u32 %r38, %r322, 1;
- setp.gt.u32 %p65, %r322, 3;
- mov.u32 %r322, %r38;
+ shr.u32 %r37, %r320, 1;
+ setp.gt.u32 %p65, %r320, 3;
+ mov.u32 %r320, %r37;
@%p65 bra $L__BB0_82;
$L__BB0_85:
mov.f64 %fd285, 0d0000000000000000;
@%p48 bra $L__BB0_88;
@@ -699,151 +698,150 @@
bar.sync 0;
@%p115 bra $L__BB0_89;
bra.uni $L__BB0_90;
$L__BB0_89:
- shl.b32 %r313, %r6, 1;
- mov.u32 %r233, %ctaid.y;
- mad.lo.s32 %r234, %r110, %r233, %r313;
- mul.wide.s32 %rd108, %r234, 8;
+ mov.u32 %r232, %ctaid.y;
+ mad.lo.s32 %r233, %r109, %r232, %r7;
+ mul.wide.s32 %rd108, %r233, 8;
add.s64 %rd106, %rd40, %rd108;
mov.b64 %rd109, %fd282;
- mov.b64 {%r225, %r226}, %rd109;
+ mov.b64 {%r224, %r225}, %rd109;
mov.b64 %rd110, %fd283;
- mov.b64 {%r227, %r228}, %rd110;
-
- st.volatile.global.v4.s32 [%rd106], {%r225,%r226,%r227,%r228};
+ mov.b64 {%r226, %r227}, %rd110;
+
+ st.volatile.global.v4.s32 [%rd106], {%r224,%r225,%r226,%r227};
add.s64 %rd107, %rd41, %rd108;
mov.b64 %rd111, %fd284;
- mov.b64 {%r229, %r230}, %rd111;
+ mov.b64 {%r228, %r229}, %rd111;
mov.b64 %rd112, %fd285;
- mov.b64 {%r231, %r232}, %rd112;
-
- st.volatile.global.v4.s32 [%rd107], {%r229,%r230,%r231,%r232};
+ mov.b64 {%r230, %r231}, %rd112;
+
+ st.volatile.global.v4.s32 [%rd107], {%r228,%r229,%r230,%r231};
$L__BB0_90:
- mov.u32 %r39, %ctaid.y;
+ mov.u32 %r38, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r235, %r6, %r8;
- or.b32 %r237, %r235, %r209;
- setp.ne.s32 %p68, %r237, 0;
+ or.b32 %r234, %r6, %r8;
+ or.b32 %r236, %r234, %r208;
+ setp.ne.s32 %p68, %r236, 0;
@%p68 bra $L__BB0_94;
ld.param.u64 %rd157, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd157;
- mov.u32 %r238, %ctaid.x;
- mov.u32 %r239, %ctaid.z;
- mov.u32 %r240, %nctaid.x;
- mad.lo.s32 %r241, %r239, %r240, %r238;
- mul.wide.s32 %rd114, %r241, 8;
+ mov.u32 %r237, %ctaid.x;
+ mov.u32 %r238, %ctaid.z;
+ mov.u32 %r239, %nctaid.x;
+ mad.lo.s32 %r240, %r238, %r239, %r237;
+ mul.wide.s32 %rd114, %r240, 8;
add.s64 %rd28, %rd113, %rd114;
- add.s32 %r242, %r9, -1;
- setp.eq.s32 %p69, %r39, %r242;
+ add.s32 %r241, %r9, -1;
+ setp.eq.s32 %p69, %r38, %r241;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p69;
atom.global.add.u64 %rd29, [%rd28], %rd118;
ld.volatile.global.u64 %rd119, [%rd28];
xor.b64 %rd120, %rd119, %rd29;
setp.lt.s64 %p70, %rd120, 0;
@%p70 bra $L__BB0_94;
- mov.u32 %r323, 8;
+ mov.u32 %r321, 8;
$L__BB0_93:
- nanosleep.u32 %r323;
-
- setp.lt.u32 %p71, %r323, 256;
- selp.u32 %r245, 1, 0, %p71;
- shl.b32 %r323, %r323, %r245;
+ nanosleep.u32 %r321;
+
+ setp.lt.u32 %p71, %r321, 256;
+ selp.u32 %r244, 1, 0, %p71;
+ shl.b32 %r321, %r321, %r244;
ld.volatile.global.u64 %rd121, [%rd28];
xor.b64 %rd122, %rd121, %rd29;
setp.gt.s64 %p72, %rd122, -1;
@%p72 bra $L__BB0_93;
$L__BB0_94:
bar.sync 0;
- add.s32 %r246, %r9, %r3;
- add.s32 %r247, %r246, -1;
- div.s32 %r42, %r247, %r3;
- setp.lt.s32 %p73, %r42, 1;
+ add.s32 %r245, %r9, %r3;
+ add.s32 %r246, %r245, -1;
+ div.s32 %r41, %r246, %r3;
+ setp.lt.s32 %p73, %r41, 1;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd291, %fd290;
@%p73 bra $L__BB0_100;
- add.s32 %r249, %r4, %r2;
- add.s32 %r250, %r249, -1;
- shl.b32 %r251, %r8, 1;
- shl.b32 %r252, %r4, 1;
- mad.lo.s32 %r253, %r252, %r39, %r251;
- or.b32 %r254, %r253, 1;
- setp.ge.s32 %p74, %r254, %r110;
- div.s32 %r255, %r250, %r4;
- setp.ge.s32 %p75, %r39, %r255;
+ add.s32 %r248, %r4, %r2;
+ add.s32 %r249, %r248, -1;
+ shl.b32 %r250, %r8, 1;
+ shl.b32 %r251, %r4, 1;
+ mad.lo.s32 %r252, %r251, %r38, %r250;
+ or.b32 %r253, %r252, 1;
+ setp.ge.s32 %p74, %r253, %r109;
+ div.s32 %r254, %r249, %r4;
+ setp.ge.s32 %p75, %r38, %r254;
or.pred %p6, %p75, %p74;
- mul.lo.s32 %r256, %r4, %r39;
- shl.b32 %r257, %r256, 1;
- mad.lo.s32 %r258, %r110, %r6, %r257;
- add.s32 %r325, %r258, %r251;
- mul.lo.s32 %r44, %r110, %r3;
- mov.u32 %r326, 0;
+ mul.lo.s32 %r255, %r4, %r38;
+ shl.b32 %r256, %r255, 1;
+ mad.lo.s32 %r257, %r109, %r6, %r256;
+ add.s32 %r323, %r257, %r250;
+ mul.lo.s32 %r43, %r109, %r3;
+ mov.u32 %r324, 0;
mov.f64 %fd209, 0d0000000000000000;
- mov.u32 %r324, %r6;
+ mov.u32 %r322, %r6;
mov.f64 %fd290, %fd209;
mov.f64 %fd291, %fd209;
$L__BB0_96:
.pragma "nounroll";
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p6 bra $L__BB0_99;
- setp.ge.s32 %p76, %r324, %r9;
+ setp.ge.s32 %p76, %r322, %r9;
mov.f64 %fd288, %fd209;
mov.f64 %fd289, %fd209;
@%p76 bra $L__BB0_99;
- mul.wide.s32 %rd124, %r325, 8;
+ mul.wide.s32 %rd124, %r323, 8;
add.s64 %rd123, %rd41, %rd124;
- ld.volatile.global.v4.s32 {%r259,%r260,%r261,%r262}, [%rd123];
-
- mov.b64 %rd125, {%r259, %r260};
+ ld.volatile.global.v4.s32 {%r258,%r259,%r260,%r261}, [%rd123];
+
+ mov.b64 %rd125, {%r258, %r259};
mov.b64 %fd289, %rd125;
- mov.b64 %rd126, {%r261, %r262};
+ mov.b64 %rd126, {%r260, %r261};
mov.b64 %fd288, %rd126;
$L__BB0_99:
add.f64 %fd290, %fd290, %fd289;
add.f64 %fd291, %fd291, %fd288;
- add.s32 %r325, %r325, %r44;
- add.s32 %r324, %r324, %r3;
- add.s32 %r326, %r326, 1;
- setp.lt.s32 %p77, %r326, %r42;
+ add.s32 %r323, %r323, %r43;
+ add.s32 %r322, %r322, %r3;
+ add.s32 %r324, %r324, 1;
+ setp.lt.s32 %p77, %r324, %r41;
@%p77 bra $L__BB0_96;
$L__BB0_100:
- clz.b32 %r263, %r3;
- mov.u32 %r264, 31;
- sub.s32 %r265, %r264, %r263;
- mov.u32 %r266, 1;
- shl.b32 %r51, %r266, %r265;
- setp.lt.u32 %p78, %r6, %r51;
- add.s32 %r267, %r51, %r6;
- setp.lt.u32 %p79, %r267, %r3;
+ clz.b32 %r262, %r3;
+ mov.u32 %r263, 31;
+ sub.s32 %r264, %r263, %r262;
+ mov.u32 %r265, 1;
+ shl.b32 %r50, %r265, %r264;
+ setp.lt.u32 %p78, %r6, %r50;
+ add.s32 %r266, %r50, %r6;
+ setp.lt.u32 %p79, %r266, %r3;
and.pred %p7, %p78, %p79;
- add.s32 %r268, %r28, %r51;
- mul.wide.s32 %rd127, %r268, 8;
+ add.s32 %r267, %r27, %r50;
+ mul.wide.s32 %rd127, %r267, 8;
add.s64 %rd30, %rd43, %rd127;
- shr.u32 %r269, %r51, 31;
- add.s32 %r270, %r51, %r269;
- shr.s32 %r333, %r270, 1;
+ shr.u32 %r268, %r50, 31;
+ add.s32 %r269, %r50, %r268;
+ shr.s32 %r331, %r269, 1;
st.shared.f64 [%rd23], %fd290;
bar.sync 0;
not.pred %p80, %p7;
@%p80 bra $L__BB0_102;
@@ -851,38 +849,38 @@
ld.shared.f64 %fd215, [%rd23];
add.f64 %fd216, %fd214, %fd215;
st.shared.f64 [%rd23], %fd216;
$L__BB0_102:
- setp.lt.s32 %p81, %r51, 4;
+ setp.lt.s32 %p81, %r50, 4;
bar.sync 0;
@%p81 bra $L__BB0_107;
- mov.u32 %r327, %r333;
+ mov.u32 %r325, %r331;
$L__BB0_104:
- setp.ge.u32 %p82, %r6, %r327;
+ setp.ge.u32 %p82, %r6, %r325;
@%p82 bra $L__BB0_106;
- add.s32 %r271, %r327, %r28;
- mul.wide.s32 %rd129, %r271, 8;
+ add.s32 %r270, %r325, %r27;
+ mul.wide.s32 %rd129, %r270, 8;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f64 %fd217, [%rd23];
ld.shared.f64 %fd218, [%rd131];
add.f64 %fd219, %fd218, %fd217;
st.shared.f64 [%rd23], %fd219;
$L__BB0_106:
bar.sync 0;
- shr.u32 %r54, %r327, 1;
- setp.gt.u32 %p83, %r327, 3;
- mov.u32 %r327, %r54;
+ shr.u32 %r53, %r325, 1;
+ setp.gt.u32 %p83, %r325, 3;
+ mov.u32 %r325, %r53;
@%p83 bra $L__BB0_104;
$L__BB0_107:
- add.s32 %r272, %r28, 1;
- mul.wide.u32 %rd132, %r272, 8;
+ add.s32 %r271, %r27, 1;
+ mul.wide.u32 %rd132, %r271, 8;
add.s64 %rd31, %rd43, %rd132;
setp.ne.s32 %p84, %r6, 0;
mov.f64 %fd292, 0d0000000000000000;
@%p84 bra $L__BB0_110;
@@ -907,29 +905,29 @@
$L__BB0_112:
bar.sync 0;
@%p81 bra $L__BB0_117;
- mov.u32 %r328, %r333;
+ mov.u32 %r326, %r331;
$L__BB0_114:
- setp.ge.u32 %p88, %r6, %r328;
+ setp.ge.u32 %p88, %r6, %r326;
@%p88 bra $L__BB0_116;
- add.s32 %r273, %r328, %r28;
- mul.wide.s32 %rd134, %r273, 8;
+ add.s32 %r272, %r326, %r27;
+ mul.wide.s32 %rd134, %r272, 8;
add.s64 %rd136, %rd43, %rd134;
ld.shared.f64 %fd226, [%rd23];
ld.shared.f64 %fd227, [%rd136];
add.f64 %fd228, %fd227, %fd226;
st.shared.f64 [%rd23], %fd228;
$L__BB0_116:
bar.sync 0;
- shr.u32 %r56, %r328, 1;
- setp.gt.u32 %p89, %r328, 3;
- mov.u32 %r328, %r56;
+ shr.u32 %r55, %r326, 1;
+ setp.gt.u32 %p89, %r326, 3;
+ mov.u32 %r326, %r55;
@%p89 bra $L__BB0_114;
$L__BB0_117:
mov.f64 %fd293, 0d0000000000000000;
@%p84 bra $L__BB0_120;
@@ -944,90 +942,90 @@
$L__BB0_120:
bar.sync 0;
@%p84 bra $L__BB0_124;
- add.s32 %r274, %r4, %r2;
- add.s32 %r275, %r274, -1;
- div.s32 %r276, %r275, %r4;
- setp.ge.s32 %p93, %r39, %r276;
+ add.s32 %r273, %r4, %r2;
+ add.s32 %r274, %r273, -1;
+ div.s32 %r275, %r274, %r4;
+ setp.ge.s32 %p93, %r38, %r275;
@%p93 bra $L__BB0_124;
- shl.b32 %r57, %r8, 1;
- mul.lo.s32 %r277, %r4, %r39;
- shl.b32 %r58, %r277, 1;
- add.s32 %r278, %r57, %r58;
- or.b32 %r279, %r278, 1;
- setp.ge.s32 %p94, %r279, %r110;
+ shl.b32 %r56, %r8, 1;
+ mul.lo.s32 %r276, %r4, %r38;
+ shl.b32 %r57, %r276, 1;
+ add.s32 %r277, %r56, %r57;
+ or.b32 %r278, %r277, 1;
+ setp.ge.s32 %p94, %r278, %r109;
@%p94 bra $L__BB0_124;
ld.param.u64 %rd156, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r284, %r58, %r57;
- mul.wide.s32 %rd138, %r284, 8;
+ add.s32 %r283, %r57, %r56;
+ mul.wide.s32 %rd138, %r283, 8;
add.s64 %rd137, %rd156, %rd138;
mov.b64 %rd139, %fd292;
- mov.b64 {%r280, %r281}, %rd139;
+ mov.b64 {%r279, %r280}, %rd139;
mov.b64 %rd140, %fd293;
- mov.b64 {%r282, %r283}, %rd140;
-
- st.global.cs.v4.s32 [%rd137], {%r280,%r281,%r282,%r283};
+ mov.b64 {%r281, %r282}, %rd140;
+
+ st.global.cs.v4.s32 [%rd137], {%r279,%r280,%r281,%r282};
$L__BB0_124:
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd299, %fd298;
@%p73 bra $L__BB0_130;
- add.s32 %r286, %r4, %r2;
- add.s32 %r287, %r286, -1;
- shl.b32 %r288, %r8, 1;
- shl.b32 %r289, %r4, 1;
- mad.lo.s32 %r290, %r289, %r39, %r288;
- or.b32 %r291, %r290, 1;
- setp.ge.s32 %p96, %r291, %r110;
- div.s32 %r292, %r287, %r4;
- setp.ge.s32 %p97, %r39, %r292;
+ add.s32 %r285, %r4, %r2;
+ add.s32 %r286, %r285, -1;
+ shl.b32 %r287, %r8, 1;
+ shl.b32 %r288, %r4, 1;
+ mad.lo.s32 %r289, %r288, %r38, %r287;
+ or.b32 %r290, %r289, 1;
+ setp.ge.s32 %p96, %r290, %r109;
+ div.s32 %r291, %r286, %r4;
+ setp.ge.s32 %p97, %r38, %r291;
or.pred %p8, %p97, %p96;
- mul.lo.s32 %r293, %r4, %r39;
- shl.b32 %r294, %r293, 1;
- mad.lo.s32 %r295, %r110, %r6, %r294;
- add.s32 %r330, %r295, %r288;
- mul.lo.s32 %r60, %r110, %r3;
- mov.u32 %r331, 0;
+ mul.lo.s32 %r292, %r4, %r38;
+ shl.b32 %r293, %r292, 1;
+ mad.lo.s32 %r294, %r109, %r6, %r293;
+ add.s32 %r328, %r294, %r287;
+ mul.lo.s32 %r59, %r109, %r3;
+ mov.u32 %r329, 0;
mov.f64 %fd235, 0d0000000000000000;
- mov.u32 %r329, %r6;
+ mov.u32 %r327, %r6;
mov.f64 %fd298, %fd235;
mov.f64 %fd299, %fd235;
$L__BB0_126:
.pragma "nounroll";
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p8 bra $L__BB0_129;
- setp.ge.s32 %p98, %r329, %r9;
+ setp.ge.s32 %p98, %r327, %r9;
mov.f64 %fd296, %fd235;
mov.f64 %fd297, %fd235;
@%p98 bra $L__BB0_129;
- mul.wide.s32 %rd142, %r330, 8;
+ mul.wide.s32 %rd142, %r328, 8;
add.s64 %rd141, %rd40, %rd142;
- ld.volatile.global.v4.s32 {%r296,%r297,%r298,%r299}, [%rd141];
-
- mov.b64 %rd143, {%r296, %r297};
+ ld.volatile.global.v4.s32 {%r295,%r296,%r297,%r298}, [%rd141];
+
+ mov.b64 %rd143, {%r295, %r296};
mov.b64 %fd297, %rd143;
- mov.b64 %rd144, {%r298, %r299};
+ mov.b64 %rd144, {%r297, %r298};
mov.b64 %fd296, %rd144;
$L__BB0_129:
add.f64 %fd298, %fd298, %fd297;
add.f64 %fd299, %fd299, %fd296;
- add.s32 %r330, %r330, %r60;
- add.s32 %r329, %r329, %r3;
- add.s32 %r331, %r331, 1;
- setp.lt.s32 %p99, %r331, %r42;
+ add.s32 %r328, %r328, %r59;
+ add.s32 %r327, %r327, %r3;
+ add.s32 %r329, %r329, 1;
+ setp.lt.s32 %p99, %r329, %r41;
@%p99 bra $L__BB0_126;
$L__BB0_130:
st.shared.f64 [%rd23], %fd298;
bar.sync 0;
@@ -1040,29 +1038,29 @@
$L__BB0_132:
bar.sync 0;
@%p81 bra $L__BB0_137;
- mov.u32 %r332, %r333;
+ mov.u32 %r330, %r331;
$L__BB0_134:
- setp.ge.u32 %p102, %r6, %r332;
+ setp.ge.u32 %p102, %r6, %r330;
@%p102 bra $L__BB0_136;
- add.s32 %r300, %r332, %r28;
- mul.wide.s32 %rd145, %r300, 8;
+ add.s32 %r299, %r330, %r27;
+ mul.wide.s32 %rd145, %r299, 8;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f64 %fd243, [%rd23];
ld.shared.f64 %fd244, [%rd147];
add.f64 %fd245, %fd244, %fd243;
st.shared.f64 [%rd23], %fd245;
$L__BB0_136:
bar.sync 0;
- shr.u32 %r68, %r332, 1;
- setp.gt.u32 %p103, %r332, 3;
- mov.u32 %r332, %r68;
+ shr.u32 %r67, %r330, 1;
+ setp.gt.u32 %p103, %r330, 3;
+ mov.u32 %r330, %r67;
@%p103 bra $L__BB0_134;
$L__BB0_137:
mov.f64 %fd300, 0d0000000000000000;
@%p84 bra $L__BB0_140;
@@ -1089,26 +1087,26 @@
$L__BB0_142:
bar.sync 0;
@%p81 bra $L__BB0_146;
$L__BB0_143:
- setp.ge.u32 %p108, %r6, %r333;
+ setp.ge.u32 %p108, %r6, %r331;
@%p108 bra $L__BB0_145;
- add.s32 %r301, %r333, %r28;
- mul.wide.s32 %rd148, %r301, 8;
+ add.s32 %r300, %r331, %r27;
+ mul.wide.s32 %rd148, %r300, 8;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f64 %fd252, [%rd23];
ld.shared.f64 %fd253, [%rd150];
add.f64 %fd254, %fd253, %fd252;
st.shared.f64 [%rd23], %fd254;
$L__BB0_145:
bar.sync 0;
- shr.u32 %r70, %r333, 1;
- setp.gt.u32 %p109, %r333, 3;
- mov.u32 %r333, %r70;
+ shr.u32 %r69, %r331, 1;
+ setp.gt.u32 %p109, %r331, 3;
+ mov.u32 %r331, %r69;
@%p109 bra $L__BB0_143;
$L__BB0_146:
mov.f64 %fd301, 0d0000000000000000;
@%p84 bra $L__BB0_149;
@@ -1123,34 +1121,34 @@
$L__BB0_149:
bar.sync 0;
@%p84 bra $L__BB0_153;
- add.s32 %r302, %r4, %r2;
- add.s32 %r303, %r302, -1;
- div.s32 %r304, %r303, %r4;
- setp.ge.s32 %p113, %r39, %r304;
+ add.s32 %r301, %r4, %r2;
+ add.s32 %r302, %r301, -1;
+ div.s32 %r303, %r302, %r4;
+ setp.ge.s32 %p113, %r38, %r303;
@%p113 bra $L__BB0_153;
- shl.b32 %r71, %r8, 1;
- mul.lo.s32 %r305, %r4, %r39;
- shl.b32 %r72, %r305, 1;
- add.s32 %r306, %r71, %r72;
- or.b32 %r307, %r306, 1;
- setp.ge.s32 %p114, %r307, %r110;
+ shl.b32 %r70, %r8, 1;
+ mul.lo.s32 %r304, %r4, %r38;
+ shl.b32 %r71, %r304, 1;
+ add.s32 %r305, %r70, %r71;
+ or.b32 %r306, %r305, 1;
+ setp.ge.s32 %p114, %r306, %r109;
@%p114 bra $L__BB0_153;
ld.param.u64 %rd155, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r312, %r72, %r71;
- mul.wide.s32 %rd152, %r312, 8;
+ add.s32 %r311, %r71, %r70;
+ mul.wide.s32 %rd152, %r311, 8;
add.s64 %rd151, %rd155, %rd152;
mov.b64 %rd153, %fd300;
- mov.b64 {%r308, %r309}, %rd153;
+ mov.b64 {%r307, %r308}, %rd153;
mov.b64 %rd154, %fd301;
- mov.b64 {%r310, %r311}, %rd154;
-
- st.global.cs.v4.s32 [%rd151], {%r308,%r309,%r310,%r311};
+ mov.b64 {%r309, %r310}, %rd154;
+
+ st.global.cs.v4.s32 [%rd151], {%r307,%r308,%r309,%r310};
$L__BB0_153:
ret;
3: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_576
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 72
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103395arrayE[];
.entry _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<613>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r107, %r108}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r117, %r118}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r121, %r122}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103399nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r143, %r108, 1;
shr.u32 %r144, %r143, 31;
add.s32 %r145, %r143, %r144;
shr.s32 %r2, %r145, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r146, %r2, %r3;
add.s32 %r147, %r146, 31;
shr.s32 %r148, %r147, 31;
shr.u32 %r149, %r148, 27;
add.s32 %r150, %r147, %r149;
shr.u32 %r151, %r150, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r152, %r4, %r151;
shl.b32 %r153, %r152, 8;
cvt.u64.u32 %rd1, %r153;
mul.lo.s32 %r154, %r4, %r2;
shl.b32 %r155, %r154, 4;
or.b32 %r156, %r155, 15;
and.b32 %r5, %r156, -16;
add.s32 %r157, %r156, %r5;
and.b32 %r158, %r157, -16;
cvt.s64.s32 %rd2, %r158;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_f3d3a0e4_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r108;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r159, %r7, 1;
setp.lt.s32 %p7, %r159, %r108;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
// end inline asm
shl.b32 %r163, %r6, 4;
add.s32 %r161, %r160, %r163;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r162, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r162, 0;
cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r164, %r4, 215;
div.s32 %r165, %r164, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r166, %r9, %r165;
add.s32 %r167, %r166, -1;
div.s32 %r10, %r167, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r169, %ctaid.y;
mul.lo.s32 %r170, %r10, %r4;
mul.lo.s32 %r11, %r170, %r169;
shl.b32 %r171, %r8, 3;
shl.b32 %r172, %r6, 4;
mad.lo.s32 %r12, %r171, %r108, %r172;
mul.lo.s32 %r173, %r108, %r8;
cvt.s64.s32 %rd53, %r173;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r174, %r11, %r108;
cvt.s64.s32 %rd6, %r174;
mul.lo.s32 %r13, %r108, %r4;
mul.lo.s32 %r14, %r10, %r169;
add.s32 %r15, %r173, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r177, %r176, %r16;
shr.u32 %r17, %r6, 5;
add.s32 %r178, %r177, %r17;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r18, %r6, 31;
add.s32 %r179, %r177, %r18;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r601, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r12, %r182;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r12, %r192;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r601, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r601;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r601;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r601;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r601;
mad.lo.s32 %r22, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r22, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r22, %r117;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r22, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r601;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r22, %r121;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r18, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r17, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r18, %r16;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r18, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r18, %r16;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r601, %r4, %r11;
mad.lo.s32 %r310, %r309, %r108, %r15;
mul.wide.s32 %rd82, %r310, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r601, %r601, 1;
setp.lt.s32 %p71, %r601, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r311, %tid.z;
mad.lo.s32 %r24, %r311, %r4, %r8;
mad.lo.s32 %r25, %r24, %r3, %r6;
mul.wide.u32 %rd83, %r25, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r312, %r4;
mov.u32 %r313, 31;
sub.s32 %r314, %r313, %r312;
mov.u32 %r315, 1;
shl.b32 %r26, %r315, %r314;
setp.lt.u32 %p72, %r8, %r26;
add.s32 %r316, %r26, %r8;
setp.lt.u32 %p73, %r316, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r317, %r3, %r314;
add.s32 %r318, %r25, %r317;
mul.wide.s32 %rd85, %r318, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r319, %r26, 31;
add.s32 %r320, %r26, %r319;
shr.s32 %r605, %r320, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r26, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r602, %r605;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r602;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r321, %r602, %r3, %r25;
mul.wide.s32 %rd86, %r321, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r29, %r602, 1;
setp.gt.u32 %p77, %r602, 3;
mov.u32 %r602, %r29;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r322, %r25, %r3;
mul.wide.u32 %rd89, %r322, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r603, %r605;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r603;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r323, %r603, %r3, %r25;
mul.wide.s32 %rd91, %r323, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r31, %r603, 1;
setp.gt.u32 %p83, %r603, 3;
mov.u32 %r603, %r31;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r604, %r605;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r604;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r324, %r604, %r3, %r25;
mul.wide.s32 %rd94, %r324, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r33, %r604, 1;
setp.gt.u32 %p89, %r604, 3;
mov.u32 %r604, %r33;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r605;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r325, %r605, %r3, %r25;
mul.wide.s32 %rd97, %r325, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r35, %r605, 1;
setp.gt.u32 %p95, %r605, 3;
mov.u32 %r605, %r35;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r334, %ctaid.y;
mad.lo.s32 %r335, %r108, %r334, %r7;
mul.wide.s32 %rd102, %r335, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r326, %r327}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r328, %r329}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r330, %r331}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r332, %r333}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_82:
mov.u32 %r36, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r336, %r6, %r8;
or.b32 %r338, %r336, %r311;
setp.ne.s32 %p98, %r338, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r339, %ctaid.x;
mov.u32 %r340, %ctaid.z;
mov.u32 %r341, %nctaid.x;
mad.lo.s32 %r342, %r340, %r341, %r339;
mul.wide.s32 %rd108, %r342, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r343, %r9, -1;
setp.eq.s32 %p99, %r36, %r343;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r606, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r606;
// end inline asm
setp.lt.u32 %p101, %r606, 256;
selp.u32 %r346, 1, 0, %p101;
shl.b32 %r606, %r606, %r346;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r347, %r4, %r2;
add.s32 %r348, %r347, -1;
div.s32 %r349, %r348, %r4;
add.s32 %r350, %r9, %r349;
add.s32 %r351, %r350, -1;
div.s32 %r39, %r351, %r9;
setp.lt.s32 %p103, %r39, 1;
@%p103 bra $L__BB0_133;
add.s32 %r353, %r9, %r3;
add.s32 %r354, %r353, -1;
shl.b32 %r40, %r8, 1;
shl.b32 %r355, %r4, 1;
mad.lo.s32 %r43, %r355, %r36, %r40;
or.b32 %r41, %r43, 1;
mul.lo.s32 %r42, %r355, %r9;
shr.u32 %r44, %r3, 5;
mul.lo.s32 %r356, %r24, %r44;
shr.u32 %r45, %r6, 5;
add.s32 %r357, %r356, %r45;
mul.wide.u32 %rd117, %r357, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r46, %r6, 31;
add.s32 %r358, %r356, %r46;
mul.wide.u32 %rd119, %r358, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r47, %r354, %r3;
mov.u32 %r607, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r47, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r360, %r42, %r607;
add.s32 %r49, %r41, %r360;
add.s32 %r50, %r43, %r360;
mov.u32 %r608, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r49, %r108;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r52, %r608, %r3, %r6;
setp.ge.s32 %p106, %r52, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r365, %r52, %r108, %r50;
mul.wide.s32 %rd121, %r365, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r361, %r362};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r363, %r364};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r608, %r608, 1;
setp.lt.s32 %p107, %r608, %r47;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r366,%r367}, %fd389;
// end inline asm
mov.u32 %r386, 31;
mov.u32 %r387, 16;
mov.u32 %r388, -1;
shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
// begin inline asm
mov.b64 %fd257, {%r368,%r369};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r370,%r371}, %fd258;
// end inline asm
mov.u32 %r389, 8;
shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
// begin inline asm
mov.b64 %fd259, {%r372,%r373};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r374,%r375}, %fd260;
// end inline asm
mov.u32 %r390, 4;
shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
// begin inline asm
mov.b64 %fd261, {%r376,%r377};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r378,%r379}, %fd262;
// end inline asm
mov.u32 %r391, 2;
shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
// begin inline asm
mov.b64 %fd263, {%r380,%r381};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r382,%r383}, %fd264;
// end inline asm
mov.u32 %r392, 1;
shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
// begin inline asm
mov.b64 %fd265, {%r384,%r385};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r46, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r45, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r46, %r44;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r393,%r394}, %fd390;
// end inline asm
mov.u32 %r413, 31;
mov.u32 %r414, 16;
mov.u32 %r415, -1;
shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
// begin inline asm
mov.b64 %fd268, {%r395,%r396};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r397,%r398}, %fd269;
// end inline asm
mov.u32 %r416, 8;
shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
// begin inline asm
mov.b64 %fd270, {%r399,%r400};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r401,%r402}, %fd271;
// end inline asm
mov.u32 %r417, 4;
shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
// begin inline asm
mov.b64 %fd272, {%r403,%r404};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r405,%r406}, %fd273;
// end inline asm
mov.u32 %r418, 2;
shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
// begin inline asm
mov.b64 %fd274, {%r407,%r408};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r409,%r410}, %fd275;
// end inline asm
mov.u32 %r419, 1;
shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
// begin inline asm
mov.b64 %fd276, {%r411,%r412};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r46, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r420,%r421}, %fd388;
// end inline asm
mov.u32 %r440, 31;
mov.u32 %r441, 16;
mov.u32 %r442, -1;
shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
// begin inline asm
mov.b64 %fd278, {%r422,%r423};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r424,%r425}, %fd279;
// end inline asm
mov.u32 %r443, 8;
shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
// begin inline asm
mov.b64 %fd280, {%r426,%r427};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r428,%r429}, %fd281;
// end inline asm
mov.u32 %r444, 4;
shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
// begin inline asm
mov.b64 %fd282, {%r430,%r431};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r432,%r433}, %fd283;
// end inline asm
mov.u32 %r445, 2;
shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
// begin inline asm
mov.b64 %fd284, {%r434,%r435};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r436,%r437}, %fd285;
// end inline asm
mov.u32 %r446, 1;
shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
// begin inline asm
mov.b64 %fd286, {%r438,%r439};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r46, %r44;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r447,%r448}, %fd392;
// end inline asm
mov.u32 %r467, 31;
mov.u32 %r468, 16;
mov.u32 %r469, -1;
shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
// begin inline asm
mov.b64 %fd290, {%r449,%r450};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r451,%r452}, %fd291;
// end inline asm
mov.u32 %r470, 8;
shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
// begin inline asm
mov.b64 %fd292, {%r453,%r454};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r455,%r456}, %fd293;
// end inline asm
mov.u32 %r471, 4;
shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
// begin inline asm
mov.b64 %fd294, {%r457,%r458};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r459,%r460}, %fd295;
// end inline asm
mov.u32 %r472, 2;
shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
// begin inline asm
mov.b64 %fd296, {%r461,%r462};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r463,%r464}, %fd297;
// end inline asm
mov.u32 %r473, 1;
shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
// begin inline asm
mov.b64 %fd298, {%r465,%r466};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r54, %r42, %r607;
add.s32 %r474, %r41, %r54;
setp.ge.s32 %p156, %r474, %r108;
@%p156 bra $L__BB0_109;
add.s32 %r479, %r43, %r54;
mul.wide.s32 %rd125, %r479, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r475, %r476}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r477, %r478}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
// end inline asm
$L__BB0_109:
add.s32 %r607, %r607, 1;
setp.lt.s32 %p158, %r607, %r39;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r56, %r108, %r6, %r40;
shl.b32 %r57, %r36, 1;
shl.b32 %r58, %r9, 1;
mul.lo.s32 %r59, %r108, %r3;
mov.u32 %r609, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r61, %r42, %r609, %r41;
mad.lo.s32 %r482, %r58, %r609, %r57;
mad.lo.s32 %r611, %r4, %r482, %r56;
mov.u32 %r612, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r610, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r61, %r108;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r610, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r611, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r483, %r484};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r485, %r486};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r611, %r611, %r59;
add.s32 %r610, %r610, %r3;
add.s32 %r612, %r612, 1;
setp.lt.s32 %p162, %r612, %r47;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r487,%r488}, %fd399;
// end inline asm
mov.u32 %r507, 31;
mov.u32 %r508, 16;
mov.u32 %r509, -1;
shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
// begin inline asm
mov.b64 %fd310, {%r489,%r490};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r491,%r492}, %fd311;
// end inline asm
mov.u32 %r510, 8;
shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
// begin inline asm
mov.b64 %fd312, {%r493,%r494};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r495,%r496}, %fd313;
// end inline asm
mov.u32 %r511, 4;
shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
// begin inline asm
mov.b64 %fd314, {%r497,%r498};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r499,%r500}, %fd315;
// end inline asm
mov.u32 %r512, 2;
shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
// begin inline asm
mov.b64 %fd316, {%r501,%r502};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r503,%r504}, %fd317;
// end inline asm
mov.u32 %r513, 1;
shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
// begin inline asm
mov.b64 %fd318, {%r505,%r506};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r46, %r44;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r514,%r515}, %fd400;
// end inline asm
mov.u32 %r534, 31;
mov.u32 %r535, 16;
mov.u32 %r536, -1;
shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
// begin inline asm
mov.b64 %fd321, {%r516,%r517};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r518,%r519}, %fd322;
// end inline asm
mov.u32 %r537, 8;
shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
// begin inline asm
mov.b64 %fd323, {%r520,%r521};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r522,%r523}, %fd324;
// end inline asm
mov.u32 %r538, 4;
shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
// begin inline asm
mov.b64 %fd325, {%r524,%r525};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r526,%r527}, %fd326;
// end inline asm
mov.u32 %r539, 2;
shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
// begin inline asm
mov.b64 %fd327, {%r528,%r529};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r530,%r531}, %fd328;
// end inline asm
mov.u32 %r540, 1;
shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
// begin inline asm
mov.b64 %fd329, {%r532,%r533};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r541,%r542}, %fd398;
// end inline asm
mov.u32 %r561, 31;
mov.u32 %r562, 16;
mov.u32 %r563, -1;
shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
// begin inline asm
mov.b64 %fd331, {%r543,%r544};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r545,%r546}, %fd332;
// end inline asm
mov.u32 %r564, 8;
shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
// begin inline asm
mov.b64 %fd333, {%r547,%r548};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r549,%r550}, %fd334;
// end inline asm
mov.u32 %r565, 4;
shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
// begin inline asm
mov.b64 %fd335, {%r551,%r552};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r553,%r554}, %fd336;
// end inline asm
mov.u32 %r566, 2;
shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
// begin inline asm
mov.b64 %fd337, {%r555,%r556};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r557,%r558}, %fd338;
// end inline asm
mov.u32 %r567, 1;
shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
// begin inline asm
mov.b64 %fd339, {%r559,%r560};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r46, %r44;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r568,%r569}, %fd402;
// end inline asm
mov.u32 %r588, 31;
mov.u32 %r589, 16;
mov.u32 %r590, -1;
shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
// begin inline asm
mov.b64 %fd343, {%r570,%r571};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r572,%r573}, %fd344;
// end inline asm
mov.u32 %r591, 8;
shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
// begin inline asm
mov.b64 %fd345, {%r574,%r575};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r576,%r577}, %fd346;
// end inline asm
mov.u32 %r592, 4;
shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
// begin inline asm
mov.b64 %fd347, {%r578,%r579};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r580,%r581}, %fd348;
// end inline asm
mov.u32 %r593, 2;
shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
// begin inline asm
mov.b64 %fd349, {%r582,%r583};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r584,%r585}, %fd350;
// end inline asm
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
// begin inline asm
mov.b64 %fd351, {%r586,%r587};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r69, %r42, %r609;
add.s32 %r595, %r41, %r69;
setp.ge.s32 %p211, %r595, %r108;
@%p211 bra $L__BB0_132;
add.s32 %r600, %r43, %r69;
mul.wide.s32 %rd133, %r600, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r596, %r597}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r598, %r599}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
// end inline asm
$L__BB0_132:
add.s32 %r609, %r609, 1;
setp.lt.s32 %p213, %r609, %r39;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72335arrayE[];
.entry _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r106, %r107}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r116, %r117}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r120, %r121}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72339nvfuser_4ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r142, %r107, 1;
shr.u32 %r143, %r142, 31;
add.s32 %r144, %r142, %r143;
shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r145, %r2, %r3;
add.s32 %r146, %r145, 31;
shr.s32 %r147, %r146, 31;
shr.u32 %r148, %r147, 27;
add.s32 %r149, %r146, %r148;
shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r151, %r4, %r150;
shl.b32 %r152, %r151, 8;
cvt.u64.u32 %rd1, %r152;
mul.lo.s32 %r153, %r4, %r2;
shl.b32 %r154, %r153, 4;
or.b32 %r155, %r154, 15;
and.b32 %r5, %r155, -16;
add.s32 %r156, %r155, %r5;
and.b32 %r157, %r156, -16;
cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_4_cu_840dfd4b_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r158, %r7, 1;
setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
// end inline asm
shl.b32 %r162, %r6, 4;
add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r161, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r161, 0;
cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r163, %r4, 215;
div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r165, %r9, %r164;
add.s32 %r166, %r165, -1;
div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r168, %ctaid.y;
mul.lo.s32 %r169, %r10, %r4;
mul.lo.s32 %r11, %r169, %r168;
mad.lo.s32 %r170, %r2, %r8, %r6;
shl.b32 %r12, %r170, 4;
mul.lo.s32 %r171, %r107, %r8;
cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r172, %r11, %r107;
cvt.s64.s32 %rd6, %r172;
mul.lo.s32 %r13, %r107, %r4;
mul.lo.s32 %r14, %r10, %r168;
shl.b32 %r173, %r8, 1;
mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r177, %r176, %r15;
shr.u32 %r16, %r6, 5;
add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r6, 31;
add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r182, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r600;
mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r21, %r107, %r7;
mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r600, %r600, 1;
setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r310, %tid.z;
mad.lo.s32 %r23, %r310, %r4, %r8;
mad.lo.s32 %r24, %r23, %r3, %r6;
mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r311, %r4;
mov.u32 %r312, 31;
sub.s32 %r313, %r312, %r311;
mov.u32 %r314, 1;
shl.b32 %r25, %r314, %r313;
setp.lt.u32 %p72, %r8, %r25;
add.s32 %r315, %r25, %r8;
setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r316, %r3, %r313;
add.s32 %r317, %r24, %r316;
mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r318, %r25, 31;
add.s32 %r319, %r25, %r318;
shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r601, %r604;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r320, %r601, %r3, %r24;
mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r28, %r601, 1;
setp.gt.u32 %p77, %r601, 3;
mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r321, %r24, %r3;
mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r602, %r604;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r322, %r602, %r3, %r24;
mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r30, %r602, 1;
setp.gt.u32 %p83, %r602, 3;
mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r603, %r604;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r323, %r603, %r3, %r24;
mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r32, %r603, 1;
setp.gt.u32 %p89, %r603, 3;
mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r324, %r604, %r3, %r24;
mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r34, %r604, 1;
setp.gt.u32 %p95, %r604, 3;
mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r333, %ctaid.y;
mad.lo.s32 %r334, %r107, %r333, %r7;
mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r327, %r328}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r331, %r332}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
// end inline asm
$L__BB0_82:
mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r335, %r6, %r8;
or.b32 %r337, %r335, %r310;
setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r338, %ctaid.x;
mov.u32 %r339, %ctaid.z;
mov.u32 %r340, %nctaid.x;
mad.lo.s32 %r341, %r339, %r340, %r338;
mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r342, %r9, -1;
setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r605, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r605;
// end inline asm
setp.lt.u32 %p101, %r605, 256;
selp.u32 %r345, 1, 0, %p101;
shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r346, %r4, %r2;
add.s32 %r347, %r346, -1;
div.s32 %r348, %r347, %r4;
add.s32 %r349, %r9, %r348;
add.s32 %r350, %r349, -1;
div.s32 %r38, %r350, %r9;
setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
add.s32 %r352, %r9, %r3;
add.s32 %r353, %r352, -1;
shl.b32 %r39, %r8, 1;
shl.b32 %r354, %r4, 1;
mad.lo.s32 %r42, %r354, %r35, %r39;
or.b32 %r40, %r42, 1;
mul.lo.s32 %r41, %r354, %r9;
shr.u32 %r43, %r3, 5;
mul.lo.s32 %r355, %r23, %r43;
shr.u32 %r44, %r6, 5;
add.s32 %r356, %r355, %r44;
mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r45, %r6, 31;
add.s32 %r357, %r355, %r45;
mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r46, %r353, %r3;
mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r359, %r41, %r606;
add.s32 %r48, %r40, %r359;
add.s32 %r49, %r42, %r359;
mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r51, %r607, %r3, %r6;
setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r364, %r51, %r107, %r49;
mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r607, %r607, 1;
setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r365,%r366}, %fd389;
// end inline asm
mov.u32 %r385, 31;
mov.u32 %r386, 16;
mov.u32 %r387, -1;
shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
// begin inline asm
mov.b64 %fd257, {%r367,%r368};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r369,%r370}, %fd258;
// end inline asm
mov.u32 %r388, 8;
shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
// begin inline asm
mov.b64 %fd259, {%r371,%r372};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r373,%r374}, %fd260;
// end inline asm
mov.u32 %r389, 4;
shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
// begin inline asm
mov.b64 %fd261, {%r375,%r376};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r377,%r378}, %fd262;
// end inline asm
mov.u32 %r390, 2;
shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
// begin inline asm
mov.b64 %fd263, {%r379,%r380};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r381,%r382}, %fd264;
// end inline asm
mov.u32 %r391, 1;
shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
// begin inline asm
mov.b64 %fd265, {%r383,%r384};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r392,%r393}, %fd390;
// end inline asm
mov.u32 %r412, 31;
mov.u32 %r413, 16;
mov.u32 %r414, -1;
shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
// begin inline asm
mov.b64 %fd268, {%r394,%r395};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r396,%r397}, %fd269;
// end inline asm
mov.u32 %r415, 8;
shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
// begin inline asm
mov.b64 %fd270, {%r398,%r399};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r400,%r401}, %fd271;
// end inline asm
mov.u32 %r416, 4;
shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
// begin inline asm
mov.b64 %fd272, {%r402,%r403};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r404,%r405}, %fd273;
// end inline asm
mov.u32 %r417, 2;
shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
// begin inline asm
mov.b64 %fd274, {%r406,%r407};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r408,%r409}, %fd275;
// end inline asm
mov.u32 %r418, 1;
shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
// begin inline asm
mov.b64 %fd276, {%r410,%r411};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r419,%r420}, %fd388;
// end inline asm
mov.u32 %r439, 31;
mov.u32 %r440, 16;
mov.u32 %r441, -1;
shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
// begin inline asm
mov.b64 %fd278, {%r421,%r422};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r423,%r424}, %fd279;
// end inline asm
mov.u32 %r442, 8;
shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
// begin inline asm
mov.b64 %fd280, {%r425,%r426};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r427,%r428}, %fd281;
// end inline asm
mov.u32 %r443, 4;
shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
// begin inline asm
mov.b64 %fd282, {%r429,%r430};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r431,%r432}, %fd283;
// end inline asm
mov.u32 %r444, 2;
shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
// begin inline asm
mov.b64 %fd284, {%r433,%r434};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r435,%r436}, %fd285;
// end inline asm
mov.u32 %r445, 1;
shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
// begin inline asm
mov.b64 %fd286, {%r437,%r438};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r446,%r447}, %fd392;
// end inline asm
mov.u32 %r466, 31;
mov.u32 %r467, 16;
mov.u32 %r468, -1;
shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
// begin inline asm
mov.b64 %fd290, {%r448,%r449};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r450,%r451}, %fd291;
// end inline asm
mov.u32 %r469, 8;
shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
// begin inline asm
mov.b64 %fd292, {%r452,%r453};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r454,%r455}, %fd293;
// end inline asm
mov.u32 %r470, 4;
shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
// begin inline asm
mov.b64 %fd294, {%r456,%r457};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r458,%r459}, %fd295;
// end inline asm
mov.u32 %r471, 2;
shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
// begin inline asm
mov.b64 %fd296, {%r460,%r461};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r462,%r463}, %fd297;
// end inline asm
mov.u32 %r472, 1;
shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
// begin inline asm
mov.b64 %fd298, {%r464,%r465};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r53, %r41, %r606;
add.s32 %r473, %r40, %r53;
setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
add.s32 %r478, %r42, %r53;
mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r476, %r477}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
// end inline asm
$L__BB0_109:
add.s32 %r606, %r606, 1;
setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r55, %r107, %r6, %r39;
shl.b32 %r56, %r35, 1;
shl.b32 %r57, %r9, 1;
mul.lo.s32 %r58, %r107, %r3;
mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r60, %r41, %r608, %r40;
mad.lo.s32 %r481, %r57, %r608, %r56;
mad.lo.s32 %r610, %r4, %r481, %r55;
mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r610, %r610, %r58;
add.s32 %r609, %r609, %r3;
add.s32 %r611, %r611, 1;
setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r486,%r487}, %fd399;
// end inline asm
mov.u32 %r506, 31;
mov.u32 %r507, 16;
mov.u32 %r508, -1;
shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
// begin inline asm
mov.b64 %fd310, {%r488,%r489};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r490,%r491}, %fd311;
// end inline asm
mov.u32 %r509, 8;
shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
// begin inline asm
mov.b64 %fd312, {%r492,%r493};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r494,%r495}, %fd313;
// end inline asm
mov.u32 %r510, 4;
shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
// begin inline asm
mov.b64 %fd314, {%r496,%r497};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r498,%r499}, %fd315;
// end inline asm
mov.u32 %r511, 2;
shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
// begin inline asm
mov.b64 %fd316, {%r500,%r501};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r502,%r503}, %fd317;
// end inline asm
mov.u32 %r512, 1;
shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
// begin inline asm
mov.b64 %fd318, {%r504,%r505};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r513,%r514}, %fd400;
// end inline asm
mov.u32 %r533, 31;
mov.u32 %r534, 16;
mov.u32 %r535, -1;
shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
// begin inline asm
mov.b64 %fd321, {%r515,%r516};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r517,%r518}, %fd322;
// end inline asm
mov.u32 %r536, 8;
shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
// begin inline asm
mov.b64 %fd323, {%r519,%r520};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r521,%r522}, %fd324;
// end inline asm
mov.u32 %r537, 4;
shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
// begin inline asm
mov.b64 %fd325, {%r523,%r524};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r525,%r526}, %fd326;
// end inline asm
mov.u32 %r538, 2;
shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
// begin inline asm
mov.b64 %fd327, {%r527,%r528};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r529,%r530}, %fd328;
// end inline asm
mov.u32 %r539, 1;
shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
// begin inline asm
mov.b64 %fd329, {%r531,%r532};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r540,%r541}, %fd398;
// end inline asm
mov.u32 %r560, 31;
mov.u32 %r561, 16;
mov.u32 %r562, -1;
shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
// begin inline asm
mov.b64 %fd331, {%r542,%r543};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r544,%r545}, %fd332;
// end inline asm
mov.u32 %r563, 8;
shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
// begin inline asm
mov.b64 %fd333, {%r546,%r547};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r548,%r549}, %fd334;
// end inline asm
mov.u32 %r564, 4;
shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
// begin inline asm
mov.b64 %fd335, {%r550,%r551};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r552,%r553}, %fd336;
// end inline asm
mov.u32 %r565, 2;
shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
// begin inline asm
mov.b64 %fd337, {%r554,%r555};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r556,%r557}, %fd338;
// end inline asm
mov.u32 %r566, 1;
shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
// begin inline asm
mov.b64 %fd339, {%r558,%r559};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r567,%r568}, %fd402;
// end inline asm
mov.u32 %r587, 31;
mov.u32 %r588, 16;
mov.u32 %r589, -1;
shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
// begin inline asm
mov.b64 %fd343, {%r569,%r570};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r571,%r572}, %fd344;
// end inline asm
mov.u32 %r590, 8;
shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
// begin inline asm
mov.b64 %fd345, {%r573,%r574};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r575,%r576}, %fd346;
// end inline asm
mov.u32 %r591, 4;
shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
// begin inline asm
mov.b64 %fd347, {%r577,%r578};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r579,%r580}, %fd348;
// end inline asm
mov.u32 %r592, 2;
shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
// begin inline asm
mov.b64 %fd349, {%r581,%r582};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r583,%r584}, %fd350;
// end inline asm
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
// begin inline asm
mov.b64 %fd351, {%r585,%r586};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r68, %r41, %r608;
add.s32 %r594, %r40, %r68;
setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
add.s32 %r599, %r42, %r68;
mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r597, %r598}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
// end inline asm
$L__BB0_132:
add.s32 %r608, %r608, 1;
setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -29,18 +29,18 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
- .reg .b32 %r<613>;
+ .reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
- ld.param.v2.u32 {%r107, %r108}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r117, %r118}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r121, %r122}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r106, %r107}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r116, %r117}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r120, %r121}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
@@ -48,113 +48,113 @@
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r143, %r108, 1;
- shr.u32 %r144, %r143, 31;
- add.s32 %r145, %r143, %r144;
- shr.s32 %r2, %r145, 1;
+ add.s32 %r142, %r107, 1;
+ shr.u32 %r143, %r142, 31;
+ add.s32 %r144, %r142, %r143;
+ shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
- max.s32 %r146, %r2, %r3;
- add.s32 %r147, %r146, 31;
- shr.s32 %r148, %r147, 31;
- shr.u32 %r149, %r148, 27;
- add.s32 %r150, %r147, %r149;
- shr.u32 %r151, %r150, 5;
+ max.s32 %r145, %r2, %r3;
+ add.s32 %r146, %r145, 31;
+ shr.s32 %r147, %r146, 31;
+ shr.u32 %r148, %r147, 27;
+ add.s32 %r149, %r146, %r148;
+ shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
- mul.lo.s32 %r152, %r4, %r151;
- shl.b32 %r153, %r152, 8;
- cvt.u64.u32 %rd1, %r153;
- mul.lo.s32 %r154, %r4, %r2;
- shl.b32 %r155, %r154, 4;
- or.b32 %r156, %r155, 15;
- and.b32 %r5, %r156, -16;
- add.s32 %r157, %r156, %r5;
- and.b32 %r158, %r157, -16;
- cvt.s64.s32 %rd2, %r158;
+ mul.lo.s32 %r151, %r4, %r150;
+ shl.b32 %r152, %r151, 8;
+ cvt.u64.u32 %rd1, %r152;
+ mul.lo.s32 %r153, %r4, %r2;
+ shl.b32 %r154, %r153, 4;
+ or.b32 %r155, %r154, 15;
+ and.b32 %r5, %r155, -16;
+ add.s32 %r156, %r155, %r5;
+ and.b32 %r157, %r156, -16;
+ cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
- cvt.rn.f64.s32 %fd1, %r108;
+ cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
- or.b32 %r159, %r7, 1;
- setp.lt.s32 %p7, %r159, %r108;
+ or.b32 %r158, %r7, 1;
+ setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
-
-
- shl.b32 %r163, %r6, 4;
- add.s32 %r161, %r160, %r163;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
+
+
+ shl.b32 %r162, %r6, 4;
+ add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
- mov.u32 %r162, 0;
+ mov.u32 %r161, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r162, 0;
- cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r161, 0;
+ cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r164, %r4, 215;
- div.s32 %r165, %r164, %r4;
+ add.s32 %r163, %r4, 215;
+ div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
- add.s32 %r166, %r9, %r165;
- add.s32 %r167, %r166, -1;
- div.s32 %r10, %r167, %r9;
+ add.s32 %r165, %r9, %r164;
+ add.s32 %r166, %r165, -1;
+ div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
- mov.u32 %r169, %ctaid.y;
- mul.lo.s32 %r170, %r10, %r4;
- mul.lo.s32 %r11, %r170, %r169;
- shl.b32 %r171, %r8, 3;
- shl.b32 %r172, %r6, 4;
- mad.lo.s32 %r12, %r171, %r108, %r172;
- mul.lo.s32 %r173, %r108, %r8;
- cvt.s64.s32 %rd53, %r173;
+ mov.u32 %r168, %ctaid.y;
+ mul.lo.s32 %r169, %r10, %r4;
+ mul.lo.s32 %r11, %r169, %r168;
+ mad.lo.s32 %r170, %r2, %r8, %r6;
+ shl.b32 %r12, %r170, 4;
+ mul.lo.s32 %r171, %r107, %r8;
+ cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r174, %r11, %r108;
- cvt.s64.s32 %rd6, %r174;
- mul.lo.s32 %r13, %r108, %r4;
- mul.lo.s32 %r14, %r10, %r169;
- add.s32 %r15, %r173, %r7;
+ mul.lo.s32 %r172, %r11, %r107;
+ cvt.s64.s32 %rd6, %r172;
+ mul.lo.s32 %r13, %r107, %r4;
+ mul.lo.s32 %r14, %r10, %r168;
+ shl.b32 %r173, %r8, 1;
+ mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 8;
+ mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r177, %r176, %r16;
- shr.u32 %r17, %r6, 5;
- add.s32 %r178, %r177, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r177, %r176, %r15;
+ shr.u32 %r16, %r6, 5;
+ add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
- and.b32 %r18, %r6, 31;
- add.s32 %r179, %r177, %r18;
+ and.b32 %r17, %r6, 31;
+ add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
@@ -162,37 +162,37 @@
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
- mov.u32 %r601, 0;
+ mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
- add.s32 %r183, %r12, %r182;
+ add.s32 %r183, %r182, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
- add.s32 %r193, %r12, %r192;
+ add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
- mad.lo.s32 %r180, %r601, %r4, %r8;
+ mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
- mul.lo.s32 %r185, %r13, %r601;
+ mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
@@ -211,11 +211,11 @@
cp.async.wait_all;
@%p11 bra $L__BB0_10;
- add.s32 %r186, %r14, %r601;
+ add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
@@ -228,38 +228,38 @@
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
- add.s32 %r188, %r14, %r601;
+ add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
- add.s32 %r190, %r14, %r601;
- mad.lo.s32 %r22, %r190, %r4, %r8;
+ add.s32 %r190, %r14, %r600;
+ mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
- setp.gt.s32 %p16, %r22, 215;
+ setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
- mul.lo.s32 %r191, %r22, %r117;
+ mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
- setp.lt.s32 %p17, %r22, 216;
+ setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
- mul.lo.s32 %r195, %r13, %r601;
+ mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
@@ -276,11 +276,11 @@
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
- mul.lo.s32 %r196, %r22, %r121;
+ mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
@@ -359,21 +359,21 @@
mov.b64 %fd147, {%r215,%r216};
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
- setp.ne.s32 %p31, %r18, 0;
+ setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
- setp.ne.s32 %p32, %r17, 0;
+ setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
- setp.ge.u32 %p33, %r18, %r16;
+ setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
@@ -483,11 +483,11 @@
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
mov.b64 %fd168, {%r269,%r270};
add.f64 %fd375, %fd167, %fd168;
- setp.eq.s32 %p4, %r18, 0;
+ setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
@@ -495,11 +495,11 @@
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
- setp.ge.u32 %p56, %r18, %r16;
+ setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
@@ -600,54 +600,53 @@
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
- mad.lo.s32 %r309, %r601, %r4, %r11;
- mad.lo.s32 %r310, %r309, %r108, %r15;
- mul.wide.s32 %rd82, %r310, 8;
+ mad.lo.s32 %r309, %r21, %r107, %r7;
+ mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
$L__BB0_40:
- add.s32 %r601, %r601, 1;
- setp.lt.s32 %p71, %r601, %r10;
+ add.s32 %r600, %r600, 1;
+ setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
- mov.u32 %r311, %tid.z;
- mad.lo.s32 %r24, %r311, %r4, %r8;
- mad.lo.s32 %r25, %r24, %r3, %r6;
- mul.wide.u32 %rd83, %r25, 8;
+ mov.u32 %r310, %tid.z;
+ mad.lo.s32 %r23, %r310, %r4, %r8;
+ mad.lo.s32 %r24, %r23, %r3, %r6;
+ mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
- clz.b32 %r312, %r4;
- mov.u32 %r313, 31;
- sub.s32 %r314, %r313, %r312;
- mov.u32 %r315, 1;
- shl.b32 %r26, %r315, %r314;
- setp.lt.u32 %p72, %r8, %r26;
- add.s32 %r316, %r26, %r8;
- setp.lt.u32 %p73, %r316, %r4;
+ clz.b32 %r311, %r4;
+ mov.u32 %r312, 31;
+ sub.s32 %r313, %r312, %r311;
+ mov.u32 %r314, 1;
+ shl.b32 %r25, %r314, %r313;
+ setp.lt.u32 %p72, %r8, %r25;
+ add.s32 %r315, %r25, %r8;
+ setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
- shl.b32 %r317, %r3, %r314;
- add.s32 %r318, %r25, %r317;
- mul.wide.s32 %rd85, %r318, 8;
+ shl.b32 %r316, %r3, %r313;
+ add.s32 %r317, %r24, %r316;
+ mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
- shr.u32 %r319, %r26, 31;
- add.s32 %r320, %r26, %r319;
- shr.s32 %r605, %r320, 1;
+ shr.u32 %r318, %r25, 31;
+ add.s32 %r319, %r25, %r318;
+ shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
@@ -655,38 +654,38 @@
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
- setp.lt.s32 %p75, %r26, 4;
+ setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
- mov.u32 %r602, %r605;
+ mov.u32 %r601, %r604;
$L__BB0_45:
- setp.ge.u32 %p76, %r8, %r602;
+ setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
- mad.lo.s32 %r321, %r602, %r3, %r25;
- mul.wide.s32 %rd86, %r321, 8;
+ mad.lo.s32 %r320, %r601, %r3, %r24;
+ mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
- shr.u32 %r29, %r602, 1;
- setp.gt.u32 %p77, %r602, 3;
- mov.u32 %r602, %r29;
+ shr.u32 %r28, %r601, 1;
+ setp.gt.u32 %p77, %r601, 3;
+ mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
- add.s32 %r322, %r25, %r3;
- mul.wide.u32 %rd89, %r322, 8;
+ add.s32 %r321, %r24, %r3;
+ mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
@@ -711,29 +710,29 @@
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
- mov.u32 %r603, %r605;
+ mov.u32 %r602, %r604;
$L__BB0_55:
- setp.ge.u32 %p82, %r8, %r603;
+ setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
- mad.lo.s32 %r323, %r603, %r3, %r25;
- mul.wide.s32 %rd91, %r323, 8;
+ mad.lo.s32 %r322, %r602, %r3, %r24;
+ mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
- shr.u32 %r31, %r603, 1;
- setp.gt.u32 %p83, %r603, 3;
- mov.u32 %r603, %r31;
+ shr.u32 %r30, %r602, 1;
+ setp.gt.u32 %p83, %r602, 3;
+ mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
@@ -759,29 +758,29 @@
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
- mov.u32 %r604, %r605;
+ mov.u32 %r603, %r604;
$L__BB0_65:
- setp.ge.u32 %p88, %r8, %r604;
+ setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
- mad.lo.s32 %r324, %r604, %r3, %r25;
- mul.wide.s32 %rd94, %r324, 8;
+ mad.lo.s32 %r323, %r603, %r3, %r24;
+ mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r33, %r604, 1;
- setp.gt.u32 %p89, %r604, 3;
- mov.u32 %r604, %r33;
+ shr.u32 %r32, %r603, 1;
+ setp.gt.u32 %p89, %r603, 3;
+ mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
@@ -808,26 +807,26 @@
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
- setp.ge.u32 %p94, %r8, %r605;
+ setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
- mad.lo.s32 %r325, %r605, %r3, %r25;
- mul.wide.s32 %rd97, %r325, 8;
+ mad.lo.s32 %r324, %r604, %r3, %r24;
+ mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
- shr.u32 %r35, %r605, 1;
- setp.gt.u32 %p95, %r605, 3;
- mov.u32 %r605, %r35;
+ shr.u32 %r34, %r604, 1;
+ setp.gt.u32 %p95, %r604, 3;
+ mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
@@ -844,328 +843,328 @@
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
- mov.u32 %r334, %ctaid.y;
- mad.lo.s32 %r335, %r108, %r334, %r7;
- mul.wide.s32 %rd102, %r335, 8;
+ mov.u32 %r333, %ctaid.y;
+ mad.lo.s32 %r334, %r107, %r333, %r7;
+ mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
- mov.b64 {%r326, %r327}, %rd103;
+ mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
- mov.b64 {%r328, %r329}, %rd104;
-
- st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
+ mov.b64 {%r327, %r328}, %rd104;
+
+ st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
- mov.b64 {%r330, %r331}, %rd105;
+ mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
- mov.b64 {%r332, %r333}, %rd106;
-
- st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
+ mov.b64 {%r331, %r332}, %rd106;
+
+ st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
$L__BB0_82:
- mov.u32 %r36, %ctaid.y;
+ mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r336, %r6, %r8;
- or.b32 %r338, %r336, %r311;
- setp.ne.s32 %p98, %r338, 0;
+ or.b32 %r335, %r6, %r8;
+ or.b32 %r337, %r335, %r310;
+ setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
- mov.u32 %r339, %ctaid.x;
- mov.u32 %r340, %ctaid.z;
- mov.u32 %r341, %nctaid.x;
- mad.lo.s32 %r342, %r340, %r341, %r339;
- mul.wide.s32 %rd108, %r342, 8;
+ mov.u32 %r338, %ctaid.x;
+ mov.u32 %r339, %ctaid.z;
+ mov.u32 %r340, %nctaid.x;
+ mad.lo.s32 %r341, %r339, %r340, %r338;
+ mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
- add.s32 %r343, %r9, -1;
- setp.eq.s32 %p99, %r36, %r343;
+ add.s32 %r342, %r9, -1;
+ setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
- mov.u32 %r606, 8;
+ mov.u32 %r605, 8;
$L__BB0_85:
- nanosleep.u32 %r606;
-
- setp.lt.u32 %p101, %r606, 256;
- selp.u32 %r346, 1, 0, %p101;
- shl.b32 %r606, %r606, %r346;
+ nanosleep.u32 %r605;
+
+ setp.lt.u32 %p101, %r605, 256;
+ selp.u32 %r345, 1, 0, %p101;
+ shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
- add.s32 %r347, %r4, %r2;
- add.s32 %r348, %r347, -1;
- div.s32 %r349, %r348, %r4;
- add.s32 %r350, %r9, %r349;
- add.s32 %r351, %r350, -1;
- div.s32 %r39, %r351, %r9;
- setp.lt.s32 %p103, %r39, 1;
+ add.s32 %r346, %r4, %r2;
+ add.s32 %r347, %r346, -1;
+ div.s32 %r348, %r347, %r4;
+ add.s32 %r349, %r9, %r348;
+ add.s32 %r350, %r349, -1;
+ div.s32 %r38, %r350, %r9;
+ setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
- add.s32 %r353, %r9, %r3;
- add.s32 %r354, %r353, -1;
- shl.b32 %r40, %r8, 1;
- shl.b32 %r355, %r4, 1;
- mad.lo.s32 %r43, %r355, %r36, %r40;
- or.b32 %r41, %r43, 1;
- mul.lo.s32 %r42, %r355, %r9;
- shr.u32 %r44, %r3, 5;
- mul.lo.s32 %r356, %r24, %r44;
- shr.u32 %r45, %r6, 5;
- add.s32 %r357, %r356, %r45;
- mul.wide.u32 %rd117, %r357, 8;
+ add.s32 %r352, %r9, %r3;
+ add.s32 %r353, %r352, -1;
+ shl.b32 %r39, %r8, 1;
+ shl.b32 %r354, %r4, 1;
+ mad.lo.s32 %r42, %r354, %r35, %r39;
+ or.b32 %r40, %r42, 1;
+ mul.lo.s32 %r41, %r354, %r9;
+ shr.u32 %r43, %r3, 5;
+ mul.lo.s32 %r355, %r23, %r43;
+ shr.u32 %r44, %r6, 5;
+ add.s32 %r356, %r355, %r44;
+ mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
- and.b32 %r46, %r6, 31;
- add.s32 %r358, %r356, %r46;
- mul.wide.u32 %rd119, %r358, 8;
+ and.b32 %r45, %r6, 31;
+ add.s32 %r357, %r355, %r45;
+ mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
- div.s32 %r47, %r354, %r3;
- mov.u32 %r607, 0;
+ div.s32 %r46, %r353, %r3;
+ mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
- setp.lt.s32 %p104, %r47, 1;
+ setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
- mul.lo.s32 %r360, %r42, %r607;
- add.s32 %r49, %r41, %r360;
- add.s32 %r50, %r43, %r360;
- mov.u32 %r608, 0;
+ mul.lo.s32 %r359, %r41, %r606;
+ add.s32 %r48, %r40, %r359;
+ add.s32 %r49, %r42, %r359;
+ mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
- setp.ge.s32 %p105, %r49, %r108;
+ setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
- mad.lo.s32 %r52, %r608, %r3, %r6;
- setp.ge.s32 %p106, %r52, %r9;
+ mad.lo.s32 %r51, %r607, %r3, %r6;
+ setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
- mad.lo.s32 %r365, %r52, %r108, %r50;
- mul.wide.s32 %rd121, %r365, 8;
+ mad.lo.s32 %r364, %r51, %r107, %r49;
+ mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
- ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
-
- mov.b64 %rd122, {%r361, %r362};
+ ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
+
+ mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
- mov.b64 %rd123, {%r363, %r364};
+ mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
- add.s32 %r608, %r608, 1;
- setp.lt.s32 %p107, %r608, %r47;
+ add.s32 %r607, %r607, 1;
+ setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
- mov.b64 {%r366,%r367}, %fd389;
-
- mov.u32 %r386, 31;
- mov.u32 %r387, 16;
- mov.u32 %r388, -1;
- shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
- shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
-
- mov.b64 %fd257, {%r368,%r369};
+ mov.b64 {%r365,%r366}, %fd389;
+
+ mov.u32 %r385, 31;
+ mov.u32 %r386, 16;
+ mov.u32 %r387, -1;
+ shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
+ shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
+
+ mov.b64 %fd257, {%r367,%r368};
add.f64 %fd258, %fd389, %fd257;
- mov.b64 {%r370,%r371}, %fd258;
-
- mov.u32 %r389, 8;
- shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
- shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
-
- mov.b64 %fd259, {%r372,%r373};
+ mov.b64 {%r369,%r370}, %fd258;
+
+ mov.u32 %r388, 8;
+ shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
+ shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
+
+ mov.b64 %fd259, {%r371,%r372};
add.f64 %fd260, %fd258, %fd259;
- mov.b64 {%r374,%r375}, %fd260;
-
- mov.u32 %r390, 4;
- shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
- shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
-
- mov.b64 %fd261, {%r376,%r377};
+ mov.b64 {%r373,%r374}, %fd260;
+
+ mov.u32 %r389, 4;
+ shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
+ shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
+
+ mov.b64 %fd261, {%r375,%r376};
add.f64 %fd262, %fd260, %fd261;
- mov.b64 {%r378,%r379}, %fd262;
-
- mov.u32 %r391, 2;
- shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
- shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
-
- mov.b64 %fd263, {%r380,%r381};
+ mov.b64 {%r377,%r378}, %fd262;
+
+ mov.u32 %r390, 2;
+ shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
+ shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
+
+ mov.b64 %fd263, {%r379,%r380};
add.f64 %fd264, %fd262, %fd263;
- mov.b64 {%r382,%r383}, %fd264;
-
- mov.u32 %r392, 1;
- shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
- shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
-
- mov.b64 %fd265, {%r384,%r385};
+ mov.b64 {%r381,%r382}, %fd264;
+
+ mov.u32 %r391, 1;
+ shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
+ shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
+
+ mov.b64 %fd265, {%r383,%r384};
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
- setp.ne.s32 %p118, %r46, 0;
+ setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
- setp.ne.s32 %p119, %r45, 0;
+ setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
- setp.ge.u32 %p120, %r46, %r44;
+ setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
- mov.b64 {%r393,%r394}, %fd390;
-
- mov.u32 %r413, 31;
- mov.u32 %r414, 16;
- mov.u32 %r415, -1;
- shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
- shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
-
- mov.b64 %fd268, {%r395,%r396};
+ mov.b64 {%r392,%r393}, %fd390;
+
+ mov.u32 %r412, 31;
+ mov.u32 %r413, 16;
+ mov.u32 %r414, -1;
+ shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
+ shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
+
+ mov.b64 %fd268, {%r394,%r395};
add.f64 %fd269, %fd390, %fd268;
- mov.b64 {%r397,%r398}, %fd269;
-
- mov.u32 %r416, 8;
- shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
- shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
-
- mov.b64 %fd270, {%r399,%r400};
+ mov.b64 {%r396,%r397}, %fd269;
+
+ mov.u32 %r415, 8;
+ shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
+ shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
+
+ mov.b64 %fd270, {%r398,%r399};
add.f64 %fd271, %fd269, %fd270;
- mov.b64 {%r401,%r402}, %fd271;
-
- mov.u32 %r417, 4;
- shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
- shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
-
- mov.b64 %fd272, {%r403,%r404};
+ mov.b64 {%r400,%r401}, %fd271;
+
+ mov.u32 %r416, 4;
+ shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
+ shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
+
+ mov.b64 %fd272, {%r402,%r403};
add.f64 %fd273, %fd271, %fd272;
- mov.b64 {%r405,%r406}, %fd273;
-
- mov.u32 %r418, 2;
- shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
- shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
-
- mov.b64 %fd274, {%r407,%r408};
+ mov.b64 {%r404,%r405}, %fd273;
+
+ mov.u32 %r417, 2;
+ shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
+ shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
+
+ mov.b64 %fd274, {%r406,%r407};
add.f64 %fd275, %fd273, %fd274;
- mov.b64 {%r409,%r410}, %fd275;
-
- mov.u32 %r419, 1;
- shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
- shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
-
- mov.b64 %fd276, {%r411,%r412};
+ mov.b64 {%r408,%r409}, %fd275;
+
+ mov.u32 %r418, 1;
+ shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
+ shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
+
+ mov.b64 %fd276, {%r410,%r411};
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
- setp.eq.s32 %p132, %r46, 0;
+ setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r420,%r421}, %fd388;
-
- mov.u32 %r440, 31;
- mov.u32 %r441, 16;
- mov.u32 %r442, -1;
- shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
- shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
-
- mov.b64 %fd278, {%r422,%r423};
+ mov.b64 {%r419,%r420}, %fd388;
+
+ mov.u32 %r439, 31;
+ mov.u32 %r440, 16;
+ mov.u32 %r441, -1;
+ shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
+ shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
+
+ mov.b64 %fd278, {%r421,%r422};
add.f64 %fd279, %fd388, %fd278;
- mov.b64 {%r424,%r425}, %fd279;
-
- mov.u32 %r443, 8;
- shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
- shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
-
- mov.b64 %fd280, {%r426,%r427};
+ mov.b64 {%r423,%r424}, %fd279;
+
+ mov.u32 %r442, 8;
+ shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
+ shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
+
+ mov.b64 %fd280, {%r425,%r426};
add.f64 %fd281, %fd279, %fd280;
- mov.b64 {%r428,%r429}, %fd281;
-
- mov.u32 %r444, 4;
- shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
- shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
-
- mov.b64 %fd282, {%r430,%r431};
+ mov.b64 {%r427,%r428}, %fd281;
+
+ mov.u32 %r443, 4;
+ shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
+ shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
+
+ mov.b64 %fd282, {%r429,%r430};
add.f64 %fd283, %fd281, %fd282;
- mov.b64 {%r432,%r433}, %fd283;
-
- mov.u32 %r445, 2;
- shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
- shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
-
- mov.b64 %fd284, {%r434,%r435};
+ mov.b64 {%r431,%r432}, %fd283;
+
+ mov.u32 %r444, 2;
+ shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
+ shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
+
+ mov.b64 %fd284, {%r433,%r434};
add.f64 %fd285, %fd283, %fd284;
- mov.b64 {%r436,%r437}, %fd285;
-
- mov.u32 %r446, 1;
- shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
- shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
-
- mov.b64 %fd286, {%r438,%r439};
+ mov.b64 {%r435,%r436}, %fd285;
+
+ mov.u32 %r445, 1;
+ shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
+ shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
+
+ mov.b64 %fd286, {%r437,%r438};
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
@@ -1173,201 +1172,201 @@
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
- setp.ge.u32 %p144, %r46, %r44;
+ setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
- mov.b64 {%r447,%r448}, %fd392;
-
- mov.u32 %r467, 31;
- mov.u32 %r468, 16;
- mov.u32 %r469, -1;
- shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
- shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
-
- mov.b64 %fd290, {%r449,%r450};
+ mov.b64 {%r446,%r447}, %fd392;
+
+ mov.u32 %r466, 31;
+ mov.u32 %r467, 16;
+ mov.u32 %r468, -1;
+ shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
+ shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
+
+ mov.b64 %fd290, {%r448,%r449};
add.f64 %fd291, %fd392, %fd290;
- mov.b64 {%r451,%r452}, %fd291;
-
- mov.u32 %r470, 8;
- shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
- shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
-
- mov.b64 %fd292, {%r453,%r454};
+ mov.b64 {%r450,%r451}, %fd291;
+
+ mov.u32 %r469, 8;
+ shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
+ shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
+
+ mov.b64 %fd292, {%r452,%r453};
add.f64 %fd293, %fd291, %fd292;
- mov.b64 {%r455,%r456}, %fd293;
-
- mov.u32 %r471, 4;
- shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
- shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
-
- mov.b64 %fd294, {%r457,%r458};
+ mov.b64 {%r454,%r455}, %fd293;
+
+ mov.u32 %r470, 4;
+ shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
+ shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
+
+ mov.b64 %fd294, {%r456,%r457};
add.f64 %fd295, %fd293, %fd294;
- mov.b64 {%r459,%r460}, %fd295;
-
- mov.u32 %r472, 2;
- shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
- shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
-
- mov.b64 %fd296, {%r461,%r462};
+ mov.b64 {%r458,%r459}, %fd295;
+
+ mov.u32 %r471, 2;
+ shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
+ shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
+
+ mov.b64 %fd296, {%r460,%r461};
add.f64 %fd297, %fd295, %fd296;
- mov.b64 {%r463,%r464}, %fd297;
-
- mov.u32 %r473, 1;
- shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
- shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
-
- mov.b64 %fd298, {%r465,%r466};
+ mov.b64 {%r462,%r463}, %fd297;
+
+ mov.u32 %r472, 1;
+ shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
+ shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
+
+ mov.b64 %fd298, {%r464,%r465};
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
- mul.lo.s32 %r54, %r42, %r607;
- add.s32 %r474, %r41, %r54;
- setp.ge.s32 %p156, %r474, %r108;
+ mul.lo.s32 %r53, %r41, %r606;
+ add.s32 %r473, %r40, %r53;
+ setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
- add.s32 %r479, %r43, %r54;
- mul.wide.s32 %rd125, %r479, 8;
+ add.s32 %r478, %r42, %r53;
+ mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
- mov.b64 {%r475, %r476}, %rd126;
+ mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
- mov.b64 {%r477, %r478}, %rd127;
-
- st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
+ mov.b64 {%r476, %r477}, %rd127;
+
+ st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
$L__BB0_109:
- add.s32 %r607, %r607, 1;
- setp.lt.s32 %p158, %r607, %r39;
+ add.s32 %r606, %r606, 1;
+ setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
- mad.lo.s32 %r56, %r108, %r6, %r40;
- shl.b32 %r57, %r36, 1;
- shl.b32 %r58, %r9, 1;
- mul.lo.s32 %r59, %r108, %r3;
- mov.u32 %r609, 0;
+ mad.lo.s32 %r55, %r107, %r6, %r39;
+ shl.b32 %r56, %r35, 1;
+ shl.b32 %r57, %r9, 1;
+ mul.lo.s32 %r58, %r107, %r3;
+ mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
- mad.lo.s32 %r61, %r42, %r609, %r41;
- mad.lo.s32 %r482, %r58, %r609, %r57;
- mad.lo.s32 %r611, %r4, %r482, %r56;
- mov.u32 %r612, 0;
+ mad.lo.s32 %r60, %r41, %r608, %r40;
+ mad.lo.s32 %r481, %r57, %r608, %r56;
+ mad.lo.s32 %r610, %r4, %r481, %r55;
+ mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
- mov.u32 %r610, %r6;
+ mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
- setp.ge.s32 %p160, %r61, %r108;
+ setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
- setp.ge.s32 %p161, %r610, %r9;
+ setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
- mul.wide.s32 %rd129, %r611, 8;
+ mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
- ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
-
- mov.b64 %rd130, {%r483, %r484};
+ ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
+
+ mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
- mov.b64 %rd131, {%r485, %r486};
+ mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
- add.s32 %r611, %r611, %r59;
- add.s32 %r610, %r610, %r3;
- add.s32 %r612, %r612, 1;
- setp.lt.s32 %p162, %r612, %r47;
+ add.s32 %r610, %r610, %r58;
+ add.s32 %r609, %r609, %r3;
+ add.s32 %r611, %r611, 1;
+ setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
- mov.b64 {%r487,%r488}, %fd399;
-
- mov.u32 %r507, 31;
- mov.u32 %r508, 16;
- mov.u32 %r509, -1;
- shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
- shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
-
- mov.b64 %fd310, {%r489,%r490};
+ mov.b64 {%r486,%r487}, %fd399;
+
+ mov.u32 %r506, 31;
+ mov.u32 %r507, 16;
+ mov.u32 %r508, -1;
+ shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
+ shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
+
+ mov.b64 %fd310, {%r488,%r489};
add.f64 %fd311, %fd399, %fd310;
- mov.b64 {%r491,%r492}, %fd311;
-
- mov.u32 %r510, 8;
- shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
- shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
-
- mov.b64 %fd312, {%r493,%r494};
+ mov.b64 {%r490,%r491}, %fd311;
+
+ mov.u32 %r509, 8;
+ shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
+ shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
+
+ mov.b64 %fd312, {%r492,%r493};
add.f64 %fd313, %fd311, %fd312;
- mov.b64 {%r495,%r496}, %fd313;
-
- mov.u32 %r511, 4;
- shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
- shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
-
- mov.b64 %fd314, {%r497,%r498};
+ mov.b64 {%r494,%r495}, %fd313;
+
+ mov.u32 %r510, 4;
+ shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
+ shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
+
+ mov.b64 %fd314, {%r496,%r497};
add.f64 %fd315, %fd313, %fd314;
- mov.b64 {%r499,%r500}, %fd315;
-
- mov.u32 %r512, 2;
- shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
- shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
-
- mov.b64 %fd316, {%r501,%r502};
+ mov.b64 {%r498,%r499}, %fd315;
+
+ mov.u32 %r511, 2;
+ shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
+ shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
+
+ mov.b64 %fd316, {%r500,%r501};
add.f64 %fd317, %fd315, %fd316;
- mov.b64 {%r503,%r504}, %fd317;
-
- mov.u32 %r513, 1;
- shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
- shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
-
- mov.b64 %fd318, {%r505,%r506};
+ mov.b64 {%r502,%r503}, %fd317;
+
+ mov.u32 %r512, 1;
+ shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
+ shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
+
+ mov.b64 %fd318, {%r504,%r505};
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
@@ -1375,124 +1374,124 @@
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
- setp.ge.u32 %p175, %r46, %r44;
+ setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
- mov.b64 {%r514,%r515}, %fd400;
-
- mov.u32 %r534, 31;
- mov.u32 %r535, 16;
- mov.u32 %r536, -1;
- shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
- shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
-
- mov.b64 %fd321, {%r516,%r517};
+ mov.b64 {%r513,%r514}, %fd400;
+
+ mov.u32 %r533, 31;
+ mov.u32 %r534, 16;
+ mov.u32 %r535, -1;
+ shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
+ shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
+
+ mov.b64 %fd321, {%r515,%r516};
add.f64 %fd322, %fd400, %fd321;
- mov.b64 {%r518,%r519}, %fd322;
-
- mov.u32 %r537, 8;
- shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
- shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
-
- mov.b64 %fd323, {%r520,%r521};
+ mov.b64 {%r517,%r518}, %fd322;
+
+ mov.u32 %r536, 8;
+ shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
+ shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
+
+ mov.b64 %fd323, {%r519,%r520};
add.f64 %fd324, %fd322, %fd323;
- mov.b64 {%r522,%r523}, %fd324;
-
- mov.u32 %r538, 4;
- shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
- shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
-
- mov.b64 %fd325, {%r524,%r525};
+ mov.b64 {%r521,%r522}, %fd324;
+
+ mov.u32 %r537, 4;
+ shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
+ shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
+
+ mov.b64 %fd325, {%r523,%r524};
add.f64 %fd326, %fd324, %fd325;
- mov.b64 {%r526,%r527}, %fd326;
-
- mov.u32 %r539, 2;
- shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
- shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
-
- mov.b64 %fd327, {%r528,%r529};
+ mov.b64 {%r525,%r526}, %fd326;
+
+ mov.u32 %r538, 2;
+ shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
+ shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
+
+ mov.b64 %fd327, {%r527,%r528};
add.f64 %fd328, %fd326, %fd327;
- mov.b64 {%r530,%r531}, %fd328;
-
- mov.u32 %r540, 1;
- shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
- shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
-
- mov.b64 %fd329, {%r532,%r533};
+ mov.b64 {%r529,%r530}, %fd328;
+
+ mov.u32 %r539, 1;
+ shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
+ shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
+
+ mov.b64 %fd329, {%r531,%r532};
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r541,%r542}, %fd398;
-
- mov.u32 %r561, 31;
- mov.u32 %r562, 16;
- mov.u32 %r563, -1;
- shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
- shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
-
- mov.b64 %fd331, {%r543,%r544};
+ mov.b64 {%r540,%r541}, %fd398;
+
+ mov.u32 %r560, 31;
+ mov.u32 %r561, 16;
+ mov.u32 %r562, -1;
+ shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
+ shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
+
+ mov.b64 %fd331, {%r542,%r543};
add.f64 %fd332, %fd398, %fd331;
- mov.b64 {%r545,%r546}, %fd332;
-
- mov.u32 %r564, 8;
- shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
- shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
-
- mov.b64 %fd333, {%r547,%r548};
+ mov.b64 {%r544,%r545}, %fd332;
+
+ mov.u32 %r563, 8;
+ shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
+ shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
+
+ mov.b64 %fd333, {%r546,%r547};
add.f64 %fd334, %fd332, %fd333;
- mov.b64 {%r549,%r550}, %fd334;
-
- mov.u32 %r565, 4;
- shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
- shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
-
- mov.b64 %fd335, {%r551,%r552};
+ mov.b64 {%r548,%r549}, %fd334;
+
+ mov.u32 %r564, 4;
+ shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
+ shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
+
+ mov.b64 %fd335, {%r550,%r551};
add.f64 %fd336, %fd334, %fd335;
- mov.b64 {%r553,%r554}, %fd336;
-
- mov.u32 %r566, 2;
- shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
- shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
-
- mov.b64 %fd337, {%r555,%r556};
+ mov.b64 {%r552,%r553}, %fd336;
+
+ mov.u32 %r565, 2;
+ shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
+ shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
+
+ mov.b64 %fd337, {%r554,%r555};
add.f64 %fd338, %fd336, %fd337;
- mov.b64 {%r557,%r558}, %fd338;
-
- mov.u32 %r567, 1;
- shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
- shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
-
- mov.b64 %fd339, {%r559,%r560};
+ mov.b64 {%r556,%r557}, %fd338;
+
+ mov.u32 %r566, 1;
+ shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
+ shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
+
+ mov.b64 %fd339, {%r558,%r559};
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
@@ -1500,95 +1499,95 @@
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
- setp.ge.u32 %p199, %r46, %r44;
+ setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
- mov.b64 {%r568,%r569}, %fd402;
-
- mov.u32 %r588, 31;
- mov.u32 %r589, 16;
- mov.u32 %r590, -1;
- shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
- shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
-
- mov.b64 %fd343, {%r570,%r571};
+ mov.b64 {%r567,%r568}, %fd402;
+
+ mov.u32 %r587, 31;
+ mov.u32 %r588, 16;
+ mov.u32 %r589, -1;
+ shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
+ shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
+
+ mov.b64 %fd343, {%r569,%r570};
add.f64 %fd344, %fd402, %fd343;
- mov.b64 {%r572,%r573}, %fd344;
-
- mov.u32 %r591, 8;
- shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
- shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
-
- mov.b64 %fd345, {%r574,%r575};
+ mov.b64 {%r571,%r572}, %fd344;
+
+ mov.u32 %r590, 8;
+ shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
+ shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
+
+ mov.b64 %fd345, {%r573,%r574};
add.f64 %fd346, %fd344, %fd345;
- mov.b64 {%r576,%r577}, %fd346;
-
- mov.u32 %r592, 4;
- shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
- shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
-
- mov.b64 %fd347, {%r578,%r579};
+ mov.b64 {%r575,%r576}, %fd346;
+
+ mov.u32 %r591, 4;
+ shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
+ shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
+
+ mov.b64 %fd347, {%r577,%r578};
add.f64 %fd348, %fd346, %fd347;
- mov.b64 {%r580,%r581}, %fd348;
-
- mov.u32 %r593, 2;
- shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
- shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
-
- mov.b64 %fd349, {%r582,%r583};
+ mov.b64 {%r579,%r580}, %fd348;
+
+ mov.u32 %r592, 2;
+ shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
+ shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
+
+ mov.b64 %fd349, {%r581,%r582};
add.f64 %fd350, %fd348, %fd349;
- mov.b64 {%r584,%r585}, %fd350;
-
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
- shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
-
- mov.b64 %fd351, {%r586,%r587};
+ mov.b64 {%r583,%r584}, %fd350;
+
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
+ shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
+
+ mov.b64 %fd351, {%r585,%r586};
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
- mul.lo.s32 %r69, %r42, %r609;
- add.s32 %r595, %r41, %r69;
- setp.ge.s32 %p211, %r595, %r108;
+ mul.lo.s32 %r68, %r41, %r608;
+ add.s32 %r594, %r40, %r68;
+ setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
- add.s32 %r600, %r43, %r69;
- mul.wide.s32 %rd133, %r600, 8;
+ add.s32 %r599, %r42, %r68;
+ mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
- mov.b64 {%r596, %r597}, %rd134;
+ mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
- mov.b64 {%r598, %r599}, %rd135;
-
- st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
+ mov.b64 {%r597, %r598}, %rd135;
+
+ st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
$L__BB0_132:
- add.s32 %r609, %r609, 1;
- setp.lt.s32 %p213, %r609, %r39;
+ add.s32 %r608, %r608, 1;
+ setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
4: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_768
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 72
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103395arrayE[];
.entry _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<613>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r107, %r108}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r117, %r118}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r121, %r122}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103399nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r143, %r108, 1;
shr.u32 %r144, %r143, 31;
add.s32 %r145, %r143, %r144;
shr.s32 %r2, %r145, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r146, %r2, %r3;
add.s32 %r147, %r146, 31;
shr.s32 %r148, %r147, 31;
shr.u32 %r149, %r148, 27;
add.s32 %r150, %r147, %r149;
shr.u32 %r151, %r150, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r152, %r4, %r151;
shl.b32 %r153, %r152, 8;
cvt.u64.u32 %rd1, %r153;
mul.lo.s32 %r154, %r4, %r2;
shl.b32 %r155, %r154, 4;
or.b32 %r156, %r155, 15;
and.b32 %r5, %r156, -16;
add.s32 %r157, %r156, %r5;
and.b32 %r158, %r157, -16;
cvt.s64.s32 %rd2, %r158;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_7c8b8ff7_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r108;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r159, %r7, 1;
setp.lt.s32 %p7, %r159, %r108;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
// end inline asm
shl.b32 %r163, %r6, 4;
add.s32 %r161, %r160, %r163;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r162, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r162, 0;
cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r164, %r4, 215;
div.s32 %r165, %r164, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r166, %r9, %r165;
add.s32 %r167, %r166, -1;
div.s32 %r10, %r167, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r169, %ctaid.y;
mul.lo.s32 %r170, %r10, %r4;
mul.lo.s32 %r11, %r170, %r169;
shl.b32 %r171, %r8, 3;
shl.b32 %r172, %r6, 4;
mad.lo.s32 %r12, %r171, %r108, %r172;
mul.lo.s32 %r173, %r108, %r8;
cvt.s64.s32 %rd53, %r173;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r174, %r11, %r108;
cvt.s64.s32 %rd6, %r174;
mul.lo.s32 %r13, %r108, %r4;
mul.lo.s32 %r14, %r10, %r169;
add.s32 %r15, %r173, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r177, %r176, %r16;
shr.u32 %r17, %r6, 5;
add.s32 %r178, %r177, %r17;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r18, %r6, 31;
add.s32 %r179, %r177, %r18;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r601, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r12, %r182;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r12, %r192;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r601, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r601;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r601;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r601;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r601;
mad.lo.s32 %r22, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r22, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r22, %r117;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r22, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r601;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r22, %r121;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r18, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r17, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r18, %r16;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r18, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r18, %r16;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r601, %r4, %r11;
mad.lo.s32 %r310, %r309, %r108, %r15;
mul.wide.s32 %rd82, %r310, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r601, %r601, 1;
setp.lt.s32 %p71, %r601, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r311, %tid.z;
mad.lo.s32 %r24, %r311, %r4, %r8;
mad.lo.s32 %r25, %r24, %r3, %r6;
mul.wide.u32 %rd83, %r25, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r312, %r4;
mov.u32 %r313, 31;
sub.s32 %r314, %r313, %r312;
mov.u32 %r315, 1;
shl.b32 %r26, %r315, %r314;
setp.lt.u32 %p72, %r8, %r26;
add.s32 %r316, %r26, %r8;
setp.lt.u32 %p73, %r316, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r317, %r3, %r314;
add.s32 %r318, %r25, %r317;
mul.wide.s32 %rd85, %r318, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r319, %r26, 31;
add.s32 %r320, %r26, %r319;
shr.s32 %r605, %r320, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r26, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r602, %r605;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r602;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r321, %r602, %r3, %r25;
mul.wide.s32 %rd86, %r321, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r29, %r602, 1;
setp.gt.u32 %p77, %r602, 3;
mov.u32 %r602, %r29;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r322, %r25, %r3;
mul.wide.u32 %rd89, %r322, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r603, %r605;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r603;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r323, %r603, %r3, %r25;
mul.wide.s32 %rd91, %r323, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r31, %r603, 1;
setp.gt.u32 %p83, %r603, 3;
mov.u32 %r603, %r31;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r604, %r605;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r604;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r324, %r604, %r3, %r25;
mul.wide.s32 %rd94, %r324, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r33, %r604, 1;
setp.gt.u32 %p89, %r604, 3;
mov.u32 %r604, %r33;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r605;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r325, %r605, %r3, %r25;
mul.wide.s32 %rd97, %r325, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r35, %r605, 1;
setp.gt.u32 %p95, %r605, 3;
mov.u32 %r605, %r35;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r334, %ctaid.y;
mad.lo.s32 %r335, %r108, %r334, %r7;
mul.wide.s32 %rd102, %r335, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r326, %r327}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r328, %r329}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r330, %r331}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r332, %r333}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_82:
mov.u32 %r36, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r336, %r6, %r8;
or.b32 %r338, %r336, %r311;
setp.ne.s32 %p98, %r338, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r339, %ctaid.x;
mov.u32 %r340, %ctaid.z;
mov.u32 %r341, %nctaid.x;
mad.lo.s32 %r342, %r340, %r341, %r339;
mul.wide.s32 %rd108, %r342, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r343, %r9, -1;
setp.eq.s32 %p99, %r36, %r343;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r606, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r606;
// end inline asm
setp.lt.u32 %p101, %r606, 256;
selp.u32 %r346, 1, 0, %p101;
shl.b32 %r606, %r606, %r346;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r347, %r4, %r2;
add.s32 %r348, %r347, -1;
div.s32 %r349, %r348, %r4;
add.s32 %r350, %r9, %r349;
add.s32 %r351, %r350, -1;
div.s32 %r39, %r351, %r9;
setp.lt.s32 %p103, %r39, 1;
@%p103 bra $L__BB0_133;
add.s32 %r353, %r9, %r3;
add.s32 %r354, %r353, -1;
shl.b32 %r40, %r8, 1;
shl.b32 %r355, %r4, 1;
mad.lo.s32 %r43, %r355, %r36, %r40;
or.b32 %r41, %r43, 1;
mul.lo.s32 %r42, %r355, %r9;
shr.u32 %r44, %r3, 5;
mul.lo.s32 %r356, %r24, %r44;
shr.u32 %r45, %r6, 5;
add.s32 %r357, %r356, %r45;
mul.wide.u32 %rd117, %r357, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r46, %r6, 31;
add.s32 %r358, %r356, %r46;
mul.wide.u32 %rd119, %r358, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r47, %r354, %r3;
mov.u32 %r607, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r47, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r360, %r42, %r607;
add.s32 %r49, %r41, %r360;
add.s32 %r50, %r43, %r360;
mov.u32 %r608, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r49, %r108;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r52, %r608, %r3, %r6;
setp.ge.s32 %p106, %r52, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r365, %r52, %r108, %r50;
mul.wide.s32 %rd121, %r365, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r361, %r362};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r363, %r364};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r608, %r608, 1;
setp.lt.s32 %p107, %r608, %r47;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r366,%r367}, %fd389;
// end inline asm
mov.u32 %r386, 31;
mov.u32 %r387, 16;
mov.u32 %r388, -1;
shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
// begin inline asm
mov.b64 %fd257, {%r368,%r369};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r370,%r371}, %fd258;
// end inline asm
mov.u32 %r389, 8;
shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
// begin inline asm
mov.b64 %fd259, {%r372,%r373};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r374,%r375}, %fd260;
// end inline asm
mov.u32 %r390, 4;
shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
// begin inline asm
mov.b64 %fd261, {%r376,%r377};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r378,%r379}, %fd262;
// end inline asm
mov.u32 %r391, 2;
shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
// begin inline asm
mov.b64 %fd263, {%r380,%r381};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r382,%r383}, %fd264;
// end inline asm
mov.u32 %r392, 1;
shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
// begin inline asm
mov.b64 %fd265, {%r384,%r385};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r46, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r45, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r46, %r44;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r393,%r394}, %fd390;
// end inline asm
mov.u32 %r413, 31;
mov.u32 %r414, 16;
mov.u32 %r415, -1;
shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
// begin inline asm
mov.b64 %fd268, {%r395,%r396};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r397,%r398}, %fd269;
// end inline asm
mov.u32 %r416, 8;
shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
// begin inline asm
mov.b64 %fd270, {%r399,%r400};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r401,%r402}, %fd271;
// end inline asm
mov.u32 %r417, 4;
shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
// begin inline asm
mov.b64 %fd272, {%r403,%r404};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r405,%r406}, %fd273;
// end inline asm
mov.u32 %r418, 2;
shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
// begin inline asm
mov.b64 %fd274, {%r407,%r408};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r409,%r410}, %fd275;
// end inline asm
mov.u32 %r419, 1;
shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
// begin inline asm
mov.b64 %fd276, {%r411,%r412};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r46, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r420,%r421}, %fd388;
// end inline asm
mov.u32 %r440, 31;
mov.u32 %r441, 16;
mov.u32 %r442, -1;
shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
// begin inline asm
mov.b64 %fd278, {%r422,%r423};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r424,%r425}, %fd279;
// end inline asm
mov.u32 %r443, 8;
shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
// begin inline asm
mov.b64 %fd280, {%r426,%r427};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r428,%r429}, %fd281;
// end inline asm
mov.u32 %r444, 4;
shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
// begin inline asm
mov.b64 %fd282, {%r430,%r431};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r432,%r433}, %fd283;
// end inline asm
mov.u32 %r445, 2;
shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
// begin inline asm
mov.b64 %fd284, {%r434,%r435};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r436,%r437}, %fd285;
// end inline asm
mov.u32 %r446, 1;
shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
// begin inline asm
mov.b64 %fd286, {%r438,%r439};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r46, %r44;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r447,%r448}, %fd392;
// end inline asm
mov.u32 %r467, 31;
mov.u32 %r468, 16;
mov.u32 %r469, -1;
shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
// begin inline asm
mov.b64 %fd290, {%r449,%r450};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r451,%r452}, %fd291;
// end inline asm
mov.u32 %r470, 8;
shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
// begin inline asm
mov.b64 %fd292, {%r453,%r454};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r455,%r456}, %fd293;
// end inline asm
mov.u32 %r471, 4;
shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
// begin inline asm
mov.b64 %fd294, {%r457,%r458};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r459,%r460}, %fd295;
// end inline asm
mov.u32 %r472, 2;
shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
// begin inline asm
mov.b64 %fd296, {%r461,%r462};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r463,%r464}, %fd297;
// end inline asm
mov.u32 %r473, 1;
shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
// begin inline asm
mov.b64 %fd298, {%r465,%r466};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r54, %r42, %r607;
add.s32 %r474, %r41, %r54;
setp.ge.s32 %p156, %r474, %r108;
@%p156 bra $L__BB0_109;
add.s32 %r479, %r43, %r54;
mul.wide.s32 %rd125, %r479, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r475, %r476}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r477, %r478}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
// end inline asm
$L__BB0_109:
add.s32 %r607, %r607, 1;
setp.lt.s32 %p158, %r607, %r39;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r56, %r108, %r6, %r40;
shl.b32 %r57, %r36, 1;
shl.b32 %r58, %r9, 1;
mul.lo.s32 %r59, %r108, %r3;
mov.u32 %r609, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r61, %r42, %r609, %r41;
mad.lo.s32 %r482, %r58, %r609, %r57;
mad.lo.s32 %r611, %r4, %r482, %r56;
mov.u32 %r612, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r610, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r61, %r108;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r610, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r611, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r483, %r484};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r485, %r486};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r611, %r611, %r59;
add.s32 %r610, %r610, %r3;
add.s32 %r612, %r612, 1;
setp.lt.s32 %p162, %r612, %r47;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r487,%r488}, %fd399;
// end inline asm
mov.u32 %r507, 31;
mov.u32 %r508, 16;
mov.u32 %r509, -1;
shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
// begin inline asm
mov.b64 %fd310, {%r489,%r490};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r491,%r492}, %fd311;
// end inline asm
mov.u32 %r510, 8;
shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
// begin inline asm
mov.b64 %fd312, {%r493,%r494};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r495,%r496}, %fd313;
// end inline asm
mov.u32 %r511, 4;
shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
// begin inline asm
mov.b64 %fd314, {%r497,%r498};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r499,%r500}, %fd315;
// end inline asm
mov.u32 %r512, 2;
shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
// begin inline asm
mov.b64 %fd316, {%r501,%r502};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r503,%r504}, %fd317;
// end inline asm
mov.u32 %r513, 1;
shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
// begin inline asm
mov.b64 %fd318, {%r505,%r506};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r46, %r44;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r514,%r515}, %fd400;
// end inline asm
mov.u32 %r534, 31;
mov.u32 %r535, 16;
mov.u32 %r536, -1;
shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
// begin inline asm
mov.b64 %fd321, {%r516,%r517};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r518,%r519}, %fd322;
// end inline asm
mov.u32 %r537, 8;
shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
// begin inline asm
mov.b64 %fd323, {%r520,%r521};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r522,%r523}, %fd324;
// end inline asm
mov.u32 %r538, 4;
shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
// begin inline asm
mov.b64 %fd325, {%r524,%r525};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r526,%r527}, %fd326;
// end inline asm
mov.u32 %r539, 2;
shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
// begin inline asm
mov.b64 %fd327, {%r528,%r529};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r530,%r531}, %fd328;
// end inline asm
mov.u32 %r540, 1;
shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
// begin inline asm
mov.b64 %fd329, {%r532,%r533};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r541,%r542}, %fd398;
// end inline asm
mov.u32 %r561, 31;
mov.u32 %r562, 16;
mov.u32 %r563, -1;
shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
// begin inline asm
mov.b64 %fd331, {%r543,%r544};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r545,%r546}, %fd332;
// end inline asm
mov.u32 %r564, 8;
shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
// begin inline asm
mov.b64 %fd333, {%r547,%r548};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r549,%r550}, %fd334;
// end inline asm
mov.u32 %r565, 4;
shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
// begin inline asm
mov.b64 %fd335, {%r551,%r552};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r553,%r554}, %fd336;
// end inline asm
mov.u32 %r566, 2;
shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
// begin inline asm
mov.b64 %fd337, {%r555,%r556};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r557,%r558}, %fd338;
// end inline asm
mov.u32 %r567, 1;
shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
// begin inline asm
mov.b64 %fd339, {%r559,%r560};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r46, %r44;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r568,%r569}, %fd402;
// end inline asm
mov.u32 %r588, 31;
mov.u32 %r589, 16;
mov.u32 %r590, -1;
shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
// begin inline asm
mov.b64 %fd343, {%r570,%r571};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r572,%r573}, %fd344;
// end inline asm
mov.u32 %r591, 8;
shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
// begin inline asm
mov.b64 %fd345, {%r574,%r575};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r576,%r577}, %fd346;
// end inline asm
mov.u32 %r592, 4;
shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
// begin inline asm
mov.b64 %fd347, {%r578,%r579};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r580,%r581}, %fd348;
// end inline asm
mov.u32 %r593, 2;
shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
// begin inline asm
mov.b64 %fd349, {%r582,%r583};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r584,%r585}, %fd350;
// end inline asm
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
// begin inline asm
mov.b64 %fd351, {%r586,%r587};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r69, %r42, %r609;
add.s32 %r595, %r41, %r69;
setp.ge.s32 %p211, %r595, %r108;
@%p211 bra $L__BB0_132;
add.s32 %r600, %r43, %r69;
mul.wide.s32 %rd133, %r600, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r596, %r597}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r598, %r599}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
// end inline asm
$L__BB0_132:
add.s32 %r609, %r609, 1;
setp.lt.s32 %p213, %r609, %r39;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72335arrayE[];
.entry _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r106, %r107}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r116, %r117}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r120, %r121}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72339nvfuser_5ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r142, %r107, 1;
shr.u32 %r143, %r142, 31;
add.s32 %r144, %r142, %r143;
shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r145, %r2, %r3;
add.s32 %r146, %r145, 31;
shr.s32 %r147, %r146, 31;
shr.u32 %r148, %r147, 27;
add.s32 %r149, %r146, %r148;
shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r151, %r4, %r150;
shl.b32 %r152, %r151, 8;
cvt.u64.u32 %rd1, %r152;
mul.lo.s32 %r153, %r4, %r2;
shl.b32 %r154, %r153, 4;
or.b32 %r155, %r154, 15;
and.b32 %r5, %r155, -16;
add.s32 %r156, %r155, %r5;
and.b32 %r157, %r156, -16;
cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_5_cu_ce948a3f_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r158, %r7, 1;
setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
// end inline asm
shl.b32 %r162, %r6, 4;
add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r161, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r161, 0;
cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r163, %r4, 215;
div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r165, %r9, %r164;
add.s32 %r166, %r165, -1;
div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r168, %ctaid.y;
mul.lo.s32 %r169, %r10, %r4;
mul.lo.s32 %r11, %r169, %r168;
mad.lo.s32 %r170, %r2, %r8, %r6;
shl.b32 %r12, %r170, 4;
mul.lo.s32 %r171, %r107, %r8;
cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r172, %r11, %r107;
cvt.s64.s32 %rd6, %r172;
mul.lo.s32 %r13, %r107, %r4;
mul.lo.s32 %r14, %r10, %r168;
shl.b32 %r173, %r8, 1;
mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r177, %r176, %r15;
shr.u32 %r16, %r6, 5;
add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r6, 31;
add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r182, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r600;
mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r21, %r107, %r7;
mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r600, %r600, 1;
setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r310, %tid.z;
mad.lo.s32 %r23, %r310, %r4, %r8;
mad.lo.s32 %r24, %r23, %r3, %r6;
mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r311, %r4;
mov.u32 %r312, 31;
sub.s32 %r313, %r312, %r311;
mov.u32 %r314, 1;
shl.b32 %r25, %r314, %r313;
setp.lt.u32 %p72, %r8, %r25;
add.s32 %r315, %r25, %r8;
setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r316, %r3, %r313;
add.s32 %r317, %r24, %r316;
mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r318, %r25, 31;
add.s32 %r319, %r25, %r318;
shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r601, %r604;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r320, %r601, %r3, %r24;
mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r28, %r601, 1;
setp.gt.u32 %p77, %r601, 3;
mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r321, %r24, %r3;
mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r602, %r604;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r322, %r602, %r3, %r24;
mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r30, %r602, 1;
setp.gt.u32 %p83, %r602, 3;
mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r603, %r604;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r323, %r603, %r3, %r24;
mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r32, %r603, 1;
setp.gt.u32 %p89, %r603, 3;
mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r324, %r604, %r3, %r24;
mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r34, %r604, 1;
setp.gt.u32 %p95, %r604, 3;
mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r333, %ctaid.y;
mad.lo.s32 %r334, %r107, %r333, %r7;
mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r327, %r328}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r331, %r332}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
// end inline asm
$L__BB0_82:
mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r335, %r6, %r8;
or.b32 %r337, %r335, %r310;
setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r338, %ctaid.x;
mov.u32 %r339, %ctaid.z;
mov.u32 %r340, %nctaid.x;
mad.lo.s32 %r341, %r339, %r340, %r338;
mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r342, %r9, -1;
setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r605, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r605;
// end inline asm
setp.lt.u32 %p101, %r605, 256;
selp.u32 %r345, 1, 0, %p101;
shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r346, %r4, %r2;
add.s32 %r347, %r346, -1;
div.s32 %r348, %r347, %r4;
add.s32 %r349, %r9, %r348;
add.s32 %r350, %r349, -1;
div.s32 %r38, %r350, %r9;
setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
add.s32 %r352, %r9, %r3;
add.s32 %r353, %r352, -1;
shl.b32 %r39, %r8, 1;
shl.b32 %r354, %r4, 1;
mad.lo.s32 %r42, %r354, %r35, %r39;
or.b32 %r40, %r42, 1;
mul.lo.s32 %r41, %r354, %r9;
shr.u32 %r43, %r3, 5;
mul.lo.s32 %r355, %r23, %r43;
shr.u32 %r44, %r6, 5;
add.s32 %r356, %r355, %r44;
mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r45, %r6, 31;
add.s32 %r357, %r355, %r45;
mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r46, %r353, %r3;
mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r359, %r41, %r606;
add.s32 %r48, %r40, %r359;
add.s32 %r49, %r42, %r359;
mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r51, %r607, %r3, %r6;
setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r364, %r51, %r107, %r49;
mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r607, %r607, 1;
setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r365,%r366}, %fd389;
// end inline asm
mov.u32 %r385, 31;
mov.u32 %r386, 16;
mov.u32 %r387, -1;
shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
// begin inline asm
mov.b64 %fd257, {%r367,%r368};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r369,%r370}, %fd258;
// end inline asm
mov.u32 %r388, 8;
shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
// begin inline asm
mov.b64 %fd259, {%r371,%r372};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r373,%r374}, %fd260;
// end inline asm
mov.u32 %r389, 4;
shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
// begin inline asm
mov.b64 %fd261, {%r375,%r376};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r377,%r378}, %fd262;
// end inline asm
mov.u32 %r390, 2;
shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
// begin inline asm
mov.b64 %fd263, {%r379,%r380};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r381,%r382}, %fd264;
// end inline asm
mov.u32 %r391, 1;
shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
// begin inline asm
mov.b64 %fd265, {%r383,%r384};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r392,%r393}, %fd390;
// end inline asm
mov.u32 %r412, 31;
mov.u32 %r413, 16;
mov.u32 %r414, -1;
shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
// begin inline asm
mov.b64 %fd268, {%r394,%r395};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r396,%r397}, %fd269;
// end inline asm
mov.u32 %r415, 8;
shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
// begin inline asm
mov.b64 %fd270, {%r398,%r399};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r400,%r401}, %fd271;
// end inline asm
mov.u32 %r416, 4;
shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
// begin inline asm
mov.b64 %fd272, {%r402,%r403};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r404,%r405}, %fd273;
// end inline asm
mov.u32 %r417, 2;
shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
// begin inline asm
mov.b64 %fd274, {%r406,%r407};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r408,%r409}, %fd275;
// end inline asm
mov.u32 %r418, 1;
shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
// begin inline asm
mov.b64 %fd276, {%r410,%r411};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r419,%r420}, %fd388;
// end inline asm
mov.u32 %r439, 31;
mov.u32 %r440, 16;
mov.u32 %r441, -1;
shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
// begin inline asm
mov.b64 %fd278, {%r421,%r422};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r423,%r424}, %fd279;
// end inline asm
mov.u32 %r442, 8;
shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
// begin inline asm
mov.b64 %fd280, {%r425,%r426};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r427,%r428}, %fd281;
// end inline asm
mov.u32 %r443, 4;
shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
// begin inline asm
mov.b64 %fd282, {%r429,%r430};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r431,%r432}, %fd283;
// end inline asm
mov.u32 %r444, 2;
shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
// begin inline asm
mov.b64 %fd284, {%r433,%r434};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r435,%r436}, %fd285;
// end inline asm
mov.u32 %r445, 1;
shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
// begin inline asm
mov.b64 %fd286, {%r437,%r438};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r446,%r447}, %fd392;
// end inline asm
mov.u32 %r466, 31;
mov.u32 %r467, 16;
mov.u32 %r468, -1;
shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
// begin inline asm
mov.b64 %fd290, {%r448,%r449};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r450,%r451}, %fd291;
// end inline asm
mov.u32 %r469, 8;
shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
// begin inline asm
mov.b64 %fd292, {%r452,%r453};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r454,%r455}, %fd293;
// end inline asm
mov.u32 %r470, 4;
shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
// begin inline asm
mov.b64 %fd294, {%r456,%r457};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r458,%r459}, %fd295;
// end inline asm
mov.u32 %r471, 2;
shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
// begin inline asm
mov.b64 %fd296, {%r460,%r461};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r462,%r463}, %fd297;
// end inline asm
mov.u32 %r472, 1;
shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
// begin inline asm
mov.b64 %fd298, {%r464,%r465};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r53, %r41, %r606;
add.s32 %r473, %r40, %r53;
setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
add.s32 %r478, %r42, %r53;
mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r476, %r477}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
// end inline asm
$L__BB0_109:
add.s32 %r606, %r606, 1;
setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r55, %r107, %r6, %r39;
shl.b32 %r56, %r35, 1;
shl.b32 %r57, %r9, 1;
mul.lo.s32 %r58, %r107, %r3;
mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r60, %r41, %r608, %r40;
mad.lo.s32 %r481, %r57, %r608, %r56;
mad.lo.s32 %r610, %r4, %r481, %r55;
mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r610, %r610, %r58;
add.s32 %r609, %r609, %r3;
add.s32 %r611, %r611, 1;
setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r486,%r487}, %fd399;
// end inline asm
mov.u32 %r506, 31;
mov.u32 %r507, 16;
mov.u32 %r508, -1;
shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
// begin inline asm
mov.b64 %fd310, {%r488,%r489};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r490,%r491}, %fd311;
// end inline asm
mov.u32 %r509, 8;
shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
// begin inline asm
mov.b64 %fd312, {%r492,%r493};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r494,%r495}, %fd313;
// end inline asm
mov.u32 %r510, 4;
shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
// begin inline asm
mov.b64 %fd314, {%r496,%r497};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r498,%r499}, %fd315;
// end inline asm
mov.u32 %r511, 2;
shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
// begin inline asm
mov.b64 %fd316, {%r500,%r501};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r502,%r503}, %fd317;
// end inline asm
mov.u32 %r512, 1;
shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
// begin inline asm
mov.b64 %fd318, {%r504,%r505};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r513,%r514}, %fd400;
// end inline asm
mov.u32 %r533, 31;
mov.u32 %r534, 16;
mov.u32 %r535, -1;
shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
// begin inline asm
mov.b64 %fd321, {%r515,%r516};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r517,%r518}, %fd322;
// end inline asm
mov.u32 %r536, 8;
shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
// begin inline asm
mov.b64 %fd323, {%r519,%r520};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r521,%r522}, %fd324;
// end inline asm
mov.u32 %r537, 4;
shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
// begin inline asm
mov.b64 %fd325, {%r523,%r524};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r525,%r526}, %fd326;
// end inline asm
mov.u32 %r538, 2;
shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
// begin inline asm
mov.b64 %fd327, {%r527,%r528};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r529,%r530}, %fd328;
// end inline asm
mov.u32 %r539, 1;
shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
// begin inline asm
mov.b64 %fd329, {%r531,%r532};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r540,%r541}, %fd398;
// end inline asm
mov.u32 %r560, 31;
mov.u32 %r561, 16;
mov.u32 %r562, -1;
shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
// begin inline asm
mov.b64 %fd331, {%r542,%r543};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r544,%r545}, %fd332;
// end inline asm
mov.u32 %r563, 8;
shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
// begin inline asm
mov.b64 %fd333, {%r546,%r547};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r548,%r549}, %fd334;
// end inline asm
mov.u32 %r564, 4;
shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
// begin inline asm
mov.b64 %fd335, {%r550,%r551};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r552,%r553}, %fd336;
// end inline asm
mov.u32 %r565, 2;
shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
// begin inline asm
mov.b64 %fd337, {%r554,%r555};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r556,%r557}, %fd338;
// end inline asm
mov.u32 %r566, 1;
shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
// begin inline asm
mov.b64 %fd339, {%r558,%r559};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r567,%r568}, %fd402;
// end inline asm
mov.u32 %r587, 31;
mov.u32 %r588, 16;
mov.u32 %r589, -1;
shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
// begin inline asm
mov.b64 %fd343, {%r569,%r570};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r571,%r572}, %fd344;
// end inline asm
mov.u32 %r590, 8;
shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
// begin inline asm
mov.b64 %fd345, {%r573,%r574};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r575,%r576}, %fd346;
// end inline asm
mov.u32 %r591, 4;
shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
// begin inline asm
mov.b64 %fd347, {%r577,%r578};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r579,%r580}, %fd348;
// end inline asm
mov.u32 %r592, 2;
shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
// begin inline asm
mov.b64 %fd349, {%r581,%r582};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r583,%r584}, %fd350;
// end inline asm
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
// begin inline asm
mov.b64 %fd351, {%r585,%r586};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r68, %r41, %r608;
add.s32 %r594, %r40, %r68;
setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
add.s32 %r599, %r42, %r68;
mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r597, %r598}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
// end inline asm
$L__BB0_132:
add.s32 %r608, %r608, 1;
setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -29,18 +29,18 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
- .reg .b32 %r<613>;
+ .reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
- ld.param.v2.u32 {%r107, %r108}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r117, %r118}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r121, %r122}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r106, %r107}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r116, %r117}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r120, %r121}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
@@ -48,113 +48,113 @@
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r143, %r108, 1;
- shr.u32 %r144, %r143, 31;
- add.s32 %r145, %r143, %r144;
- shr.s32 %r2, %r145, 1;
+ add.s32 %r142, %r107, 1;
+ shr.u32 %r143, %r142, 31;
+ add.s32 %r144, %r142, %r143;
+ shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
- max.s32 %r146, %r2, %r3;
- add.s32 %r147, %r146, 31;
- shr.s32 %r148, %r147, 31;
- shr.u32 %r149, %r148, 27;
- add.s32 %r150, %r147, %r149;
- shr.u32 %r151, %r150, 5;
+ max.s32 %r145, %r2, %r3;
+ add.s32 %r146, %r145, 31;
+ shr.s32 %r147, %r146, 31;
+ shr.u32 %r148, %r147, 27;
+ add.s32 %r149, %r146, %r148;
+ shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
- mul.lo.s32 %r152, %r4, %r151;
- shl.b32 %r153, %r152, 8;
- cvt.u64.u32 %rd1, %r153;
- mul.lo.s32 %r154, %r4, %r2;
- shl.b32 %r155, %r154, 4;
- or.b32 %r156, %r155, 15;
- and.b32 %r5, %r156, -16;
- add.s32 %r157, %r156, %r5;
- and.b32 %r158, %r157, -16;
- cvt.s64.s32 %rd2, %r158;
+ mul.lo.s32 %r151, %r4, %r150;
+ shl.b32 %r152, %r151, 8;
+ cvt.u64.u32 %rd1, %r152;
+ mul.lo.s32 %r153, %r4, %r2;
+ shl.b32 %r154, %r153, 4;
+ or.b32 %r155, %r154, 15;
+ and.b32 %r5, %r155, -16;
+ add.s32 %r156, %r155, %r5;
+ and.b32 %r157, %r156, -16;
+ cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
- cvt.rn.f64.s32 %fd1, %r108;
+ cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
- or.b32 %r159, %r7, 1;
- setp.lt.s32 %p7, %r159, %r108;
+ or.b32 %r158, %r7, 1;
+ setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
-
-
- shl.b32 %r163, %r6, 4;
- add.s32 %r161, %r160, %r163;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
+
+
+ shl.b32 %r162, %r6, 4;
+ add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
- mov.u32 %r162, 0;
+ mov.u32 %r161, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r162, 0;
- cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r161, 0;
+ cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r164, %r4, 215;
- div.s32 %r165, %r164, %r4;
+ add.s32 %r163, %r4, 215;
+ div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
- add.s32 %r166, %r9, %r165;
- add.s32 %r167, %r166, -1;
- div.s32 %r10, %r167, %r9;
+ add.s32 %r165, %r9, %r164;
+ add.s32 %r166, %r165, -1;
+ div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
- mov.u32 %r169, %ctaid.y;
- mul.lo.s32 %r170, %r10, %r4;
- mul.lo.s32 %r11, %r170, %r169;
- shl.b32 %r171, %r8, 3;
- shl.b32 %r172, %r6, 4;
- mad.lo.s32 %r12, %r171, %r108, %r172;
- mul.lo.s32 %r173, %r108, %r8;
- cvt.s64.s32 %rd53, %r173;
+ mov.u32 %r168, %ctaid.y;
+ mul.lo.s32 %r169, %r10, %r4;
+ mul.lo.s32 %r11, %r169, %r168;
+ mad.lo.s32 %r170, %r2, %r8, %r6;
+ shl.b32 %r12, %r170, 4;
+ mul.lo.s32 %r171, %r107, %r8;
+ cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r174, %r11, %r108;
- cvt.s64.s32 %rd6, %r174;
- mul.lo.s32 %r13, %r108, %r4;
- mul.lo.s32 %r14, %r10, %r169;
- add.s32 %r15, %r173, %r7;
+ mul.lo.s32 %r172, %r11, %r107;
+ cvt.s64.s32 %rd6, %r172;
+ mul.lo.s32 %r13, %r107, %r4;
+ mul.lo.s32 %r14, %r10, %r168;
+ shl.b32 %r173, %r8, 1;
+ mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 8;
+ mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r177, %r176, %r16;
- shr.u32 %r17, %r6, 5;
- add.s32 %r178, %r177, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r177, %r176, %r15;
+ shr.u32 %r16, %r6, 5;
+ add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
- and.b32 %r18, %r6, 31;
- add.s32 %r179, %r177, %r18;
+ and.b32 %r17, %r6, 31;
+ add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
@@ -162,37 +162,37 @@
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
- mov.u32 %r601, 0;
+ mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
- add.s32 %r183, %r12, %r182;
+ add.s32 %r183, %r182, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
- add.s32 %r193, %r12, %r192;
+ add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
- mad.lo.s32 %r180, %r601, %r4, %r8;
+ mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
- mul.lo.s32 %r185, %r13, %r601;
+ mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
@@ -211,11 +211,11 @@
cp.async.wait_all;
@%p11 bra $L__BB0_10;
- add.s32 %r186, %r14, %r601;
+ add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
@@ -228,38 +228,38 @@
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
- add.s32 %r188, %r14, %r601;
+ add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
- add.s32 %r190, %r14, %r601;
- mad.lo.s32 %r22, %r190, %r4, %r8;
+ add.s32 %r190, %r14, %r600;
+ mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
- setp.gt.s32 %p16, %r22, 215;
+ setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
- mul.lo.s32 %r191, %r22, %r117;
+ mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
- setp.lt.s32 %p17, %r22, 216;
+ setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
- mul.lo.s32 %r195, %r13, %r601;
+ mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
@@ -276,11 +276,11 @@
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
- mul.lo.s32 %r196, %r22, %r121;
+ mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
@@ -359,21 +359,21 @@
mov.b64 %fd147, {%r215,%r216};
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
- setp.ne.s32 %p31, %r18, 0;
+ setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
- setp.ne.s32 %p32, %r17, 0;
+ setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
- setp.ge.u32 %p33, %r18, %r16;
+ setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
@@ -483,11 +483,11 @@
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
mov.b64 %fd168, {%r269,%r270};
add.f64 %fd375, %fd167, %fd168;
- setp.eq.s32 %p4, %r18, 0;
+ setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
@@ -495,11 +495,11 @@
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
- setp.ge.u32 %p56, %r18, %r16;
+ setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
@@ -600,54 +600,53 @@
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
- mad.lo.s32 %r309, %r601, %r4, %r11;
- mad.lo.s32 %r310, %r309, %r108, %r15;
- mul.wide.s32 %rd82, %r310, 8;
+ mad.lo.s32 %r309, %r21, %r107, %r7;
+ mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
$L__BB0_40:
- add.s32 %r601, %r601, 1;
- setp.lt.s32 %p71, %r601, %r10;
+ add.s32 %r600, %r600, 1;
+ setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
- mov.u32 %r311, %tid.z;
- mad.lo.s32 %r24, %r311, %r4, %r8;
- mad.lo.s32 %r25, %r24, %r3, %r6;
- mul.wide.u32 %rd83, %r25, 8;
+ mov.u32 %r310, %tid.z;
+ mad.lo.s32 %r23, %r310, %r4, %r8;
+ mad.lo.s32 %r24, %r23, %r3, %r6;
+ mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
- clz.b32 %r312, %r4;
- mov.u32 %r313, 31;
- sub.s32 %r314, %r313, %r312;
- mov.u32 %r315, 1;
- shl.b32 %r26, %r315, %r314;
- setp.lt.u32 %p72, %r8, %r26;
- add.s32 %r316, %r26, %r8;
- setp.lt.u32 %p73, %r316, %r4;
+ clz.b32 %r311, %r4;
+ mov.u32 %r312, 31;
+ sub.s32 %r313, %r312, %r311;
+ mov.u32 %r314, 1;
+ shl.b32 %r25, %r314, %r313;
+ setp.lt.u32 %p72, %r8, %r25;
+ add.s32 %r315, %r25, %r8;
+ setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
- shl.b32 %r317, %r3, %r314;
- add.s32 %r318, %r25, %r317;
- mul.wide.s32 %rd85, %r318, 8;
+ shl.b32 %r316, %r3, %r313;
+ add.s32 %r317, %r24, %r316;
+ mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
- shr.u32 %r319, %r26, 31;
- add.s32 %r320, %r26, %r319;
- shr.s32 %r605, %r320, 1;
+ shr.u32 %r318, %r25, 31;
+ add.s32 %r319, %r25, %r318;
+ shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
@@ -655,38 +654,38 @@
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
- setp.lt.s32 %p75, %r26, 4;
+ setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
- mov.u32 %r602, %r605;
+ mov.u32 %r601, %r604;
$L__BB0_45:
- setp.ge.u32 %p76, %r8, %r602;
+ setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
- mad.lo.s32 %r321, %r602, %r3, %r25;
- mul.wide.s32 %rd86, %r321, 8;
+ mad.lo.s32 %r320, %r601, %r3, %r24;
+ mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
- shr.u32 %r29, %r602, 1;
- setp.gt.u32 %p77, %r602, 3;
- mov.u32 %r602, %r29;
+ shr.u32 %r28, %r601, 1;
+ setp.gt.u32 %p77, %r601, 3;
+ mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
- add.s32 %r322, %r25, %r3;
- mul.wide.u32 %rd89, %r322, 8;
+ add.s32 %r321, %r24, %r3;
+ mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
@@ -711,29 +710,29 @@
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
- mov.u32 %r603, %r605;
+ mov.u32 %r602, %r604;
$L__BB0_55:
- setp.ge.u32 %p82, %r8, %r603;
+ setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
- mad.lo.s32 %r323, %r603, %r3, %r25;
- mul.wide.s32 %rd91, %r323, 8;
+ mad.lo.s32 %r322, %r602, %r3, %r24;
+ mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
- shr.u32 %r31, %r603, 1;
- setp.gt.u32 %p83, %r603, 3;
- mov.u32 %r603, %r31;
+ shr.u32 %r30, %r602, 1;
+ setp.gt.u32 %p83, %r602, 3;
+ mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
@@ -759,29 +758,29 @@
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
- mov.u32 %r604, %r605;
+ mov.u32 %r603, %r604;
$L__BB0_65:
- setp.ge.u32 %p88, %r8, %r604;
+ setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
- mad.lo.s32 %r324, %r604, %r3, %r25;
- mul.wide.s32 %rd94, %r324, 8;
+ mad.lo.s32 %r323, %r603, %r3, %r24;
+ mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r33, %r604, 1;
- setp.gt.u32 %p89, %r604, 3;
- mov.u32 %r604, %r33;
+ shr.u32 %r32, %r603, 1;
+ setp.gt.u32 %p89, %r603, 3;
+ mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
@@ -808,26 +807,26 @@
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
- setp.ge.u32 %p94, %r8, %r605;
+ setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
- mad.lo.s32 %r325, %r605, %r3, %r25;
- mul.wide.s32 %rd97, %r325, 8;
+ mad.lo.s32 %r324, %r604, %r3, %r24;
+ mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
- shr.u32 %r35, %r605, 1;
- setp.gt.u32 %p95, %r605, 3;
- mov.u32 %r605, %r35;
+ shr.u32 %r34, %r604, 1;
+ setp.gt.u32 %p95, %r604, 3;
+ mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
@@ -844,328 +843,328 @@
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
- mov.u32 %r334, %ctaid.y;
- mad.lo.s32 %r335, %r108, %r334, %r7;
- mul.wide.s32 %rd102, %r335, 8;
+ mov.u32 %r333, %ctaid.y;
+ mad.lo.s32 %r334, %r107, %r333, %r7;
+ mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
- mov.b64 {%r326, %r327}, %rd103;
+ mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
- mov.b64 {%r328, %r329}, %rd104;
-
- st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
+ mov.b64 {%r327, %r328}, %rd104;
+
+ st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
- mov.b64 {%r330, %r331}, %rd105;
+ mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
- mov.b64 {%r332, %r333}, %rd106;
-
- st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
+ mov.b64 {%r331, %r332}, %rd106;
+
+ st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
$L__BB0_82:
- mov.u32 %r36, %ctaid.y;
+ mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r336, %r6, %r8;
- or.b32 %r338, %r336, %r311;
- setp.ne.s32 %p98, %r338, 0;
+ or.b32 %r335, %r6, %r8;
+ or.b32 %r337, %r335, %r310;
+ setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
- mov.u32 %r339, %ctaid.x;
- mov.u32 %r340, %ctaid.z;
- mov.u32 %r341, %nctaid.x;
- mad.lo.s32 %r342, %r340, %r341, %r339;
- mul.wide.s32 %rd108, %r342, 8;
+ mov.u32 %r338, %ctaid.x;
+ mov.u32 %r339, %ctaid.z;
+ mov.u32 %r340, %nctaid.x;
+ mad.lo.s32 %r341, %r339, %r340, %r338;
+ mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
- add.s32 %r343, %r9, -1;
- setp.eq.s32 %p99, %r36, %r343;
+ add.s32 %r342, %r9, -1;
+ setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
- mov.u32 %r606, 8;
+ mov.u32 %r605, 8;
$L__BB0_85:
- nanosleep.u32 %r606;
-
- setp.lt.u32 %p101, %r606, 256;
- selp.u32 %r346, 1, 0, %p101;
- shl.b32 %r606, %r606, %r346;
+ nanosleep.u32 %r605;
+
+ setp.lt.u32 %p101, %r605, 256;
+ selp.u32 %r345, 1, 0, %p101;
+ shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
- add.s32 %r347, %r4, %r2;
- add.s32 %r348, %r347, -1;
- div.s32 %r349, %r348, %r4;
- add.s32 %r350, %r9, %r349;
- add.s32 %r351, %r350, -1;
- div.s32 %r39, %r351, %r9;
- setp.lt.s32 %p103, %r39, 1;
+ add.s32 %r346, %r4, %r2;
+ add.s32 %r347, %r346, -1;
+ div.s32 %r348, %r347, %r4;
+ add.s32 %r349, %r9, %r348;
+ add.s32 %r350, %r349, -1;
+ div.s32 %r38, %r350, %r9;
+ setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
- add.s32 %r353, %r9, %r3;
- add.s32 %r354, %r353, -1;
- shl.b32 %r40, %r8, 1;
- shl.b32 %r355, %r4, 1;
- mad.lo.s32 %r43, %r355, %r36, %r40;
- or.b32 %r41, %r43, 1;
- mul.lo.s32 %r42, %r355, %r9;
- shr.u32 %r44, %r3, 5;
- mul.lo.s32 %r356, %r24, %r44;
- shr.u32 %r45, %r6, 5;
- add.s32 %r357, %r356, %r45;
- mul.wide.u32 %rd117, %r357, 8;
+ add.s32 %r352, %r9, %r3;
+ add.s32 %r353, %r352, -1;
+ shl.b32 %r39, %r8, 1;
+ shl.b32 %r354, %r4, 1;
+ mad.lo.s32 %r42, %r354, %r35, %r39;
+ or.b32 %r40, %r42, 1;
+ mul.lo.s32 %r41, %r354, %r9;
+ shr.u32 %r43, %r3, 5;
+ mul.lo.s32 %r355, %r23, %r43;
+ shr.u32 %r44, %r6, 5;
+ add.s32 %r356, %r355, %r44;
+ mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
- and.b32 %r46, %r6, 31;
- add.s32 %r358, %r356, %r46;
- mul.wide.u32 %rd119, %r358, 8;
+ and.b32 %r45, %r6, 31;
+ add.s32 %r357, %r355, %r45;
+ mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
- div.s32 %r47, %r354, %r3;
- mov.u32 %r607, 0;
+ div.s32 %r46, %r353, %r3;
+ mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
- setp.lt.s32 %p104, %r47, 1;
+ setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
- mul.lo.s32 %r360, %r42, %r607;
- add.s32 %r49, %r41, %r360;
- add.s32 %r50, %r43, %r360;
- mov.u32 %r608, 0;
+ mul.lo.s32 %r359, %r41, %r606;
+ add.s32 %r48, %r40, %r359;
+ add.s32 %r49, %r42, %r359;
+ mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
- setp.ge.s32 %p105, %r49, %r108;
+ setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
- mad.lo.s32 %r52, %r608, %r3, %r6;
- setp.ge.s32 %p106, %r52, %r9;
+ mad.lo.s32 %r51, %r607, %r3, %r6;
+ setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
- mad.lo.s32 %r365, %r52, %r108, %r50;
- mul.wide.s32 %rd121, %r365, 8;
+ mad.lo.s32 %r364, %r51, %r107, %r49;
+ mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
- ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
-
- mov.b64 %rd122, {%r361, %r362};
+ ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
+
+ mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
- mov.b64 %rd123, {%r363, %r364};
+ mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
- add.s32 %r608, %r608, 1;
- setp.lt.s32 %p107, %r608, %r47;
+ add.s32 %r607, %r607, 1;
+ setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
- mov.b64 {%r366,%r367}, %fd389;
-
- mov.u32 %r386, 31;
- mov.u32 %r387, 16;
- mov.u32 %r388, -1;
- shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
- shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
-
- mov.b64 %fd257, {%r368,%r369};
+ mov.b64 {%r365,%r366}, %fd389;
+
+ mov.u32 %r385, 31;
+ mov.u32 %r386, 16;
+ mov.u32 %r387, -1;
+ shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
+ shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
+
+ mov.b64 %fd257, {%r367,%r368};
add.f64 %fd258, %fd389, %fd257;
- mov.b64 {%r370,%r371}, %fd258;
-
- mov.u32 %r389, 8;
- shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
- shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
-
- mov.b64 %fd259, {%r372,%r373};
+ mov.b64 {%r369,%r370}, %fd258;
+
+ mov.u32 %r388, 8;
+ shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
+ shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
+
+ mov.b64 %fd259, {%r371,%r372};
add.f64 %fd260, %fd258, %fd259;
- mov.b64 {%r374,%r375}, %fd260;
-
- mov.u32 %r390, 4;
- shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
- shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
-
- mov.b64 %fd261, {%r376,%r377};
+ mov.b64 {%r373,%r374}, %fd260;
+
+ mov.u32 %r389, 4;
+ shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
+ shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
+
+ mov.b64 %fd261, {%r375,%r376};
add.f64 %fd262, %fd260, %fd261;
- mov.b64 {%r378,%r379}, %fd262;
-
- mov.u32 %r391, 2;
- shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
- shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
-
- mov.b64 %fd263, {%r380,%r381};
+ mov.b64 {%r377,%r378}, %fd262;
+
+ mov.u32 %r390, 2;
+ shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
+ shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
+
+ mov.b64 %fd263, {%r379,%r380};
add.f64 %fd264, %fd262, %fd263;
- mov.b64 {%r382,%r383}, %fd264;
-
- mov.u32 %r392, 1;
- shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
- shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
-
- mov.b64 %fd265, {%r384,%r385};
+ mov.b64 {%r381,%r382}, %fd264;
+
+ mov.u32 %r391, 1;
+ shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
+ shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
+
+ mov.b64 %fd265, {%r383,%r384};
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
- setp.ne.s32 %p118, %r46, 0;
+ setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
- setp.ne.s32 %p119, %r45, 0;
+ setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
- setp.ge.u32 %p120, %r46, %r44;
+ setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
- mov.b64 {%r393,%r394}, %fd390;
-
- mov.u32 %r413, 31;
- mov.u32 %r414, 16;
- mov.u32 %r415, -1;
- shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
- shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
-
- mov.b64 %fd268, {%r395,%r396};
+ mov.b64 {%r392,%r393}, %fd390;
+
+ mov.u32 %r412, 31;
+ mov.u32 %r413, 16;
+ mov.u32 %r414, -1;
+ shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
+ shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
+
+ mov.b64 %fd268, {%r394,%r395};
add.f64 %fd269, %fd390, %fd268;
- mov.b64 {%r397,%r398}, %fd269;
-
- mov.u32 %r416, 8;
- shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
- shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
-
- mov.b64 %fd270, {%r399,%r400};
+ mov.b64 {%r396,%r397}, %fd269;
+
+ mov.u32 %r415, 8;
+ shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
+ shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
+
+ mov.b64 %fd270, {%r398,%r399};
add.f64 %fd271, %fd269, %fd270;
- mov.b64 {%r401,%r402}, %fd271;
-
- mov.u32 %r417, 4;
- shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
- shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
-
- mov.b64 %fd272, {%r403,%r404};
+ mov.b64 {%r400,%r401}, %fd271;
+
+ mov.u32 %r416, 4;
+ shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
+ shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
+
+ mov.b64 %fd272, {%r402,%r403};
add.f64 %fd273, %fd271, %fd272;
- mov.b64 {%r405,%r406}, %fd273;
-
- mov.u32 %r418, 2;
- shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
- shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
-
- mov.b64 %fd274, {%r407,%r408};
+ mov.b64 {%r404,%r405}, %fd273;
+
+ mov.u32 %r417, 2;
+ shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
+ shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
+
+ mov.b64 %fd274, {%r406,%r407};
add.f64 %fd275, %fd273, %fd274;
- mov.b64 {%r409,%r410}, %fd275;
-
- mov.u32 %r419, 1;
- shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
- shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
-
- mov.b64 %fd276, {%r411,%r412};
+ mov.b64 {%r408,%r409}, %fd275;
+
+ mov.u32 %r418, 1;
+ shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
+ shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
+
+ mov.b64 %fd276, {%r410,%r411};
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
- setp.eq.s32 %p132, %r46, 0;
+ setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r420,%r421}, %fd388;
-
- mov.u32 %r440, 31;
- mov.u32 %r441, 16;
- mov.u32 %r442, -1;
- shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
- shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
-
- mov.b64 %fd278, {%r422,%r423};
+ mov.b64 {%r419,%r420}, %fd388;
+
+ mov.u32 %r439, 31;
+ mov.u32 %r440, 16;
+ mov.u32 %r441, -1;
+ shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
+ shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
+
+ mov.b64 %fd278, {%r421,%r422};
add.f64 %fd279, %fd388, %fd278;
- mov.b64 {%r424,%r425}, %fd279;
-
- mov.u32 %r443, 8;
- shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
- shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
-
- mov.b64 %fd280, {%r426,%r427};
+ mov.b64 {%r423,%r424}, %fd279;
+
+ mov.u32 %r442, 8;
+ shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
+ shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
+
+ mov.b64 %fd280, {%r425,%r426};
add.f64 %fd281, %fd279, %fd280;
- mov.b64 {%r428,%r429}, %fd281;
-
- mov.u32 %r444, 4;
- shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
- shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
-
- mov.b64 %fd282, {%r430,%r431};
+ mov.b64 {%r427,%r428}, %fd281;
+
+ mov.u32 %r443, 4;
+ shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
+ shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
+
+ mov.b64 %fd282, {%r429,%r430};
add.f64 %fd283, %fd281, %fd282;
- mov.b64 {%r432,%r433}, %fd283;
-
- mov.u32 %r445, 2;
- shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
- shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
-
- mov.b64 %fd284, {%r434,%r435};
+ mov.b64 {%r431,%r432}, %fd283;
+
+ mov.u32 %r444, 2;
+ shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
+ shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
+
+ mov.b64 %fd284, {%r433,%r434};
add.f64 %fd285, %fd283, %fd284;
- mov.b64 {%r436,%r437}, %fd285;
-
- mov.u32 %r446, 1;
- shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
- shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
-
- mov.b64 %fd286, {%r438,%r439};
+ mov.b64 {%r435,%r436}, %fd285;
+
+ mov.u32 %r445, 1;
+ shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
+ shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
+
+ mov.b64 %fd286, {%r437,%r438};
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
@@ -1173,201 +1172,201 @@
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
- setp.ge.u32 %p144, %r46, %r44;
+ setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
- mov.b64 {%r447,%r448}, %fd392;
-
- mov.u32 %r467, 31;
- mov.u32 %r468, 16;
- mov.u32 %r469, -1;
- shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
- shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
-
- mov.b64 %fd290, {%r449,%r450};
+ mov.b64 {%r446,%r447}, %fd392;
+
+ mov.u32 %r466, 31;
+ mov.u32 %r467, 16;
+ mov.u32 %r468, -1;
+ shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
+ shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
+
+ mov.b64 %fd290, {%r448,%r449};
add.f64 %fd291, %fd392, %fd290;
- mov.b64 {%r451,%r452}, %fd291;
-
- mov.u32 %r470, 8;
- shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
- shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
-
- mov.b64 %fd292, {%r453,%r454};
+ mov.b64 {%r450,%r451}, %fd291;
+
+ mov.u32 %r469, 8;
+ shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
+ shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
+
+ mov.b64 %fd292, {%r452,%r453};
add.f64 %fd293, %fd291, %fd292;
- mov.b64 {%r455,%r456}, %fd293;
-
- mov.u32 %r471, 4;
- shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
- shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
-
- mov.b64 %fd294, {%r457,%r458};
+ mov.b64 {%r454,%r455}, %fd293;
+
+ mov.u32 %r470, 4;
+ shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
+ shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
+
+ mov.b64 %fd294, {%r456,%r457};
add.f64 %fd295, %fd293, %fd294;
- mov.b64 {%r459,%r460}, %fd295;
-
- mov.u32 %r472, 2;
- shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
- shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
-
- mov.b64 %fd296, {%r461,%r462};
+ mov.b64 {%r458,%r459}, %fd295;
+
+ mov.u32 %r471, 2;
+ shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
+ shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
+
+ mov.b64 %fd296, {%r460,%r461};
add.f64 %fd297, %fd295, %fd296;
- mov.b64 {%r463,%r464}, %fd297;
-
- mov.u32 %r473, 1;
- shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
- shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
-
- mov.b64 %fd298, {%r465,%r466};
+ mov.b64 {%r462,%r463}, %fd297;
+
+ mov.u32 %r472, 1;
+ shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
+ shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
+
+ mov.b64 %fd298, {%r464,%r465};
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
- mul.lo.s32 %r54, %r42, %r607;
- add.s32 %r474, %r41, %r54;
- setp.ge.s32 %p156, %r474, %r108;
+ mul.lo.s32 %r53, %r41, %r606;
+ add.s32 %r473, %r40, %r53;
+ setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
- add.s32 %r479, %r43, %r54;
- mul.wide.s32 %rd125, %r479, 8;
+ add.s32 %r478, %r42, %r53;
+ mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
- mov.b64 {%r475, %r476}, %rd126;
+ mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
- mov.b64 {%r477, %r478}, %rd127;
-
- st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
+ mov.b64 {%r476, %r477}, %rd127;
+
+ st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
$L__BB0_109:
- add.s32 %r607, %r607, 1;
- setp.lt.s32 %p158, %r607, %r39;
+ add.s32 %r606, %r606, 1;
+ setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
- mad.lo.s32 %r56, %r108, %r6, %r40;
- shl.b32 %r57, %r36, 1;
- shl.b32 %r58, %r9, 1;
- mul.lo.s32 %r59, %r108, %r3;
- mov.u32 %r609, 0;
+ mad.lo.s32 %r55, %r107, %r6, %r39;
+ shl.b32 %r56, %r35, 1;
+ shl.b32 %r57, %r9, 1;
+ mul.lo.s32 %r58, %r107, %r3;
+ mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
- mad.lo.s32 %r61, %r42, %r609, %r41;
- mad.lo.s32 %r482, %r58, %r609, %r57;
- mad.lo.s32 %r611, %r4, %r482, %r56;
- mov.u32 %r612, 0;
+ mad.lo.s32 %r60, %r41, %r608, %r40;
+ mad.lo.s32 %r481, %r57, %r608, %r56;
+ mad.lo.s32 %r610, %r4, %r481, %r55;
+ mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
- mov.u32 %r610, %r6;
+ mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
- setp.ge.s32 %p160, %r61, %r108;
+ setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
- setp.ge.s32 %p161, %r610, %r9;
+ setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
- mul.wide.s32 %rd129, %r611, 8;
+ mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
- ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
-
- mov.b64 %rd130, {%r483, %r484};
+ ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
+
+ mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
- mov.b64 %rd131, {%r485, %r486};
+ mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
- add.s32 %r611, %r611, %r59;
- add.s32 %r610, %r610, %r3;
- add.s32 %r612, %r612, 1;
- setp.lt.s32 %p162, %r612, %r47;
+ add.s32 %r610, %r610, %r58;
+ add.s32 %r609, %r609, %r3;
+ add.s32 %r611, %r611, 1;
+ setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
- mov.b64 {%r487,%r488}, %fd399;
-
- mov.u32 %r507, 31;
- mov.u32 %r508, 16;
- mov.u32 %r509, -1;
- shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
- shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
-
- mov.b64 %fd310, {%r489,%r490};
+ mov.b64 {%r486,%r487}, %fd399;
+
+ mov.u32 %r506, 31;
+ mov.u32 %r507, 16;
+ mov.u32 %r508, -1;
+ shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
+ shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
+
+ mov.b64 %fd310, {%r488,%r489};
add.f64 %fd311, %fd399, %fd310;
- mov.b64 {%r491,%r492}, %fd311;
-
- mov.u32 %r510, 8;
- shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
- shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
-
- mov.b64 %fd312, {%r493,%r494};
+ mov.b64 {%r490,%r491}, %fd311;
+
+ mov.u32 %r509, 8;
+ shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
+ shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
+
+ mov.b64 %fd312, {%r492,%r493};
add.f64 %fd313, %fd311, %fd312;
- mov.b64 {%r495,%r496}, %fd313;
-
- mov.u32 %r511, 4;
- shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
- shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
-
- mov.b64 %fd314, {%r497,%r498};
+ mov.b64 {%r494,%r495}, %fd313;
+
+ mov.u32 %r510, 4;
+ shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
+ shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
+
+ mov.b64 %fd314, {%r496,%r497};
add.f64 %fd315, %fd313, %fd314;
- mov.b64 {%r499,%r500}, %fd315;
-
- mov.u32 %r512, 2;
- shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
- shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
-
- mov.b64 %fd316, {%r501,%r502};
+ mov.b64 {%r498,%r499}, %fd315;
+
+ mov.u32 %r511, 2;
+ shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
+ shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
+
+ mov.b64 %fd316, {%r500,%r501};
add.f64 %fd317, %fd315, %fd316;
- mov.b64 {%r503,%r504}, %fd317;
-
- mov.u32 %r513, 1;
- shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
- shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
-
- mov.b64 %fd318, {%r505,%r506};
+ mov.b64 {%r502,%r503}, %fd317;
+
+ mov.u32 %r512, 1;
+ shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
+ shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
+
+ mov.b64 %fd318, {%r504,%r505};
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
@@ -1375,124 +1374,124 @@
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
- setp.ge.u32 %p175, %r46, %r44;
+ setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
- mov.b64 {%r514,%r515}, %fd400;
-
- mov.u32 %r534, 31;
- mov.u32 %r535, 16;
- mov.u32 %r536, -1;
- shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
- shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
-
- mov.b64 %fd321, {%r516,%r517};
+ mov.b64 {%r513,%r514}, %fd400;
+
+ mov.u32 %r533, 31;
+ mov.u32 %r534, 16;
+ mov.u32 %r535, -1;
+ shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
+ shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
+
+ mov.b64 %fd321, {%r515,%r516};
add.f64 %fd322, %fd400, %fd321;
- mov.b64 {%r518,%r519}, %fd322;
-
- mov.u32 %r537, 8;
- shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
- shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
-
- mov.b64 %fd323, {%r520,%r521};
+ mov.b64 {%r517,%r518}, %fd322;
+
+ mov.u32 %r536, 8;
+ shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
+ shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
+
+ mov.b64 %fd323, {%r519,%r520};
add.f64 %fd324, %fd322, %fd323;
- mov.b64 {%r522,%r523}, %fd324;
-
- mov.u32 %r538, 4;
- shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
- shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
-
- mov.b64 %fd325, {%r524,%r525};
+ mov.b64 {%r521,%r522}, %fd324;
+
+ mov.u32 %r537, 4;
+ shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
+ shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
+
+ mov.b64 %fd325, {%r523,%r524};
add.f64 %fd326, %fd324, %fd325;
- mov.b64 {%r526,%r527}, %fd326;
-
- mov.u32 %r539, 2;
- shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
- shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
-
- mov.b64 %fd327, {%r528,%r529};
+ mov.b64 {%r525,%r526}, %fd326;
+
+ mov.u32 %r538, 2;
+ shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
+ shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
+
+ mov.b64 %fd327, {%r527,%r528};
add.f64 %fd328, %fd326, %fd327;
- mov.b64 {%r530,%r531}, %fd328;
-
- mov.u32 %r540, 1;
- shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
- shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
-
- mov.b64 %fd329, {%r532,%r533};
+ mov.b64 {%r529,%r530}, %fd328;
+
+ mov.u32 %r539, 1;
+ shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
+ shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
+
+ mov.b64 %fd329, {%r531,%r532};
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r541,%r542}, %fd398;
-
- mov.u32 %r561, 31;
- mov.u32 %r562, 16;
- mov.u32 %r563, -1;
- shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
- shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
-
- mov.b64 %fd331, {%r543,%r544};
+ mov.b64 {%r540,%r541}, %fd398;
+
+ mov.u32 %r560, 31;
+ mov.u32 %r561, 16;
+ mov.u32 %r562, -1;
+ shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
+ shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
+
+ mov.b64 %fd331, {%r542,%r543};
add.f64 %fd332, %fd398, %fd331;
- mov.b64 {%r545,%r546}, %fd332;
-
- mov.u32 %r564, 8;
- shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
- shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
-
- mov.b64 %fd333, {%r547,%r548};
+ mov.b64 {%r544,%r545}, %fd332;
+
+ mov.u32 %r563, 8;
+ shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
+ shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
+
+ mov.b64 %fd333, {%r546,%r547};
add.f64 %fd334, %fd332, %fd333;
- mov.b64 {%r549,%r550}, %fd334;
-
- mov.u32 %r565, 4;
- shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
- shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
-
- mov.b64 %fd335, {%r551,%r552};
+ mov.b64 {%r548,%r549}, %fd334;
+
+ mov.u32 %r564, 4;
+ shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
+ shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
+
+ mov.b64 %fd335, {%r550,%r551};
add.f64 %fd336, %fd334, %fd335;
- mov.b64 {%r553,%r554}, %fd336;
-
- mov.u32 %r566, 2;
- shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
- shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
-
- mov.b64 %fd337, {%r555,%r556};
+ mov.b64 {%r552,%r553}, %fd336;
+
+ mov.u32 %r565, 2;
+ shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
+ shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
+
+ mov.b64 %fd337, {%r554,%r555};
add.f64 %fd338, %fd336, %fd337;
- mov.b64 {%r557,%r558}, %fd338;
-
- mov.u32 %r567, 1;
- shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
- shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
-
- mov.b64 %fd339, {%r559,%r560};
+ mov.b64 {%r556,%r557}, %fd338;
+
+ mov.u32 %r566, 1;
+ shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
+ shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
+
+ mov.b64 %fd339, {%r558,%r559};
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
@@ -1500,95 +1499,95 @@
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
- setp.ge.u32 %p199, %r46, %r44;
+ setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
- mov.b64 {%r568,%r569}, %fd402;
-
- mov.u32 %r588, 31;
- mov.u32 %r589, 16;
- mov.u32 %r590, -1;
- shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
- shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
-
- mov.b64 %fd343, {%r570,%r571};
+ mov.b64 {%r567,%r568}, %fd402;
+
+ mov.u32 %r587, 31;
+ mov.u32 %r588, 16;
+ mov.u32 %r589, -1;
+ shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
+ shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
+
+ mov.b64 %fd343, {%r569,%r570};
add.f64 %fd344, %fd402, %fd343;
- mov.b64 {%r572,%r573}, %fd344;
-
- mov.u32 %r591, 8;
- shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
- shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
-
- mov.b64 %fd345, {%r574,%r575};
+ mov.b64 {%r571,%r572}, %fd344;
+
+ mov.u32 %r590, 8;
+ shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
+ shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
+
+ mov.b64 %fd345, {%r573,%r574};
add.f64 %fd346, %fd344, %fd345;
- mov.b64 {%r576,%r577}, %fd346;
-
- mov.u32 %r592, 4;
- shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
- shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
-
- mov.b64 %fd347, {%r578,%r579};
+ mov.b64 {%r575,%r576}, %fd346;
+
+ mov.u32 %r591, 4;
+ shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
+ shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
+
+ mov.b64 %fd347, {%r577,%r578};
add.f64 %fd348, %fd346, %fd347;
- mov.b64 {%r580,%r581}, %fd348;
-
- mov.u32 %r593, 2;
- shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
- shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
-
- mov.b64 %fd349, {%r582,%r583};
+ mov.b64 {%r579,%r580}, %fd348;
+
+ mov.u32 %r592, 2;
+ shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
+ shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
+
+ mov.b64 %fd349, {%r581,%r582};
add.f64 %fd350, %fd348, %fd349;
- mov.b64 {%r584,%r585}, %fd350;
-
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
- shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
-
- mov.b64 %fd351, {%r586,%r587};
+ mov.b64 {%r583,%r584}, %fd350;
+
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
+ shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
+
+ mov.b64 %fd351, {%r585,%r586};
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
- mul.lo.s32 %r69, %r42, %r609;
- add.s32 %r595, %r41, %r69;
- setp.ge.s32 %p211, %r595, %r108;
+ mul.lo.s32 %r68, %r41, %r608;
+ add.s32 %r594, %r40, %r68;
+ setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
- add.s32 %r600, %r43, %r69;
- mul.wide.s32 %rd133, %r600, 8;
+ add.s32 %r599, %r42, %r68;
+ mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
- mov.b64 {%r596, %r597}, %rd134;
+ mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
- mov.b64 {%r598, %r599}, %rd135;
-
- st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
+ mov.b64 {%r597, %r598}, %rd135;
+
+ st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
$L__BB0_132:
- add.s32 %r609, %r609, 1;
- setp.lt.s32 %p213, %r609, %r39;
+ add.s32 %r608, %r608, 1;
+ setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
5: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_1024
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 72
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T0, Tensor<double, 2, 2> T1, Tensor<double, 2, 2> T2, Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T4, Tensor<double, 1, 1> T5, Tensor<double, 2, 2> T20, Tensor<double, 1, 1> T22, Tensor<double, 1, 1> T23, Tensor<double, 2, 2> T48, Tensor<double, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 2)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(double), 16);
double* T34 = reinterpret_cast<double*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8)) + 15) & -16));
double* T30 = reinterpret_cast<double*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 2))) * 2) * 8) + 15) & -16));
double* T31 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (2 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<double, 2, 2> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T49[i6] = 0.00000000000000000e+00;
}
Array<double, 2, 2> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T54[i7] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8] = 0.00000000000000000e+00;
}
Array<double, 2, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
T47[i9] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<double, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<double, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<double, 1, 1> T19;
T19[0]
= d5
* T33[0];
Array<double, 1, 1> T11;
T11[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T14;
T14[0] = 0.00000000000000000e+00;
asm volatile("cp.async.wait_all;\n");
Array<double, 1, 1> T46;
T46[0] = 0.00000000000000000e+00;
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
Array<double, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<double, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<double, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<double, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<double, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
Array<double, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
T24[0]
= T42[i11];
Array<double, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<double, 1, 1> T10;
T10[0]
= d4
* T25[0];
Array<double, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<double, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<double, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<double, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, true, double(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T48[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/true>( &*(volatile double*)&T53[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<double, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<double, 2, 2> T55;
T55.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile double*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<double, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<double, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.00000000000000000e+00;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<double, 2, 2> T50;
T50.set(double(0.00000000000000000e+00));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile double*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<double, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.00000000000000000e+00;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](double &a, double b) { a = a + b; }, static_cast<double*>(shared_mem), true, static_cast<double>(0.00000000000000000e+00), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<double, 2, 2> T41;
T41.set(double(0.00000000000000000e+00));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T41[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 2; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((8 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (2 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<double, 1, 1> T57;
T57[0] = 0.00000000000000000e+00;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T40;
T40.set(double(0));
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T45;
T45.set(double(0));
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T43;
T43.set(double(0));
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T39;
T39.set(double(0));
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<double, 2, 2> T40;
T40.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T40[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T45;
T45.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T45[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T43;
T43.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T43[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T39;
T39.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T39[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
Array<double, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<double, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<double*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<double, 2, 2> T44;
T44.set(double(0));
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<double, 2, 2> T42;
T42.set(double(0));
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
Array<double, 2, 2> T38;
T38.set(double(0));
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
Array<double, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[((((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<double, 2, 2> T44;
T44.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T44[0], &T31[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<double, 2, 2> T42;
T42.set(double(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<double, 2>( &T42[0], &T34[(2 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<double, 2, 2> T38;
T38.set(double(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 2))) && ((1 + (2 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<double, 2>( &T38[0], &T30[((2 * ((nvfuser_index_t)threadIdx.x)) + ((2 * (ceilDiv(i2, 2))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103395arrayE[];
.entry _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<613>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r107, %r108}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r117, %r118}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r121, %r122}, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103399nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r143, %r108, 1;
shr.u32 %r144, %r143, 31;
add.s32 %r145, %r143, %r144;
shr.s32 %r2, %r145, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r146, %r2, %r3;
add.s32 %r147, %r146, 31;
shr.s32 %r148, %r147, 31;
shr.u32 %r149, %r148, 27;
add.s32 %r150, %r147, %r149;
shr.u32 %r151, %r150, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r152, %r4, %r151;
shl.b32 %r153, %r152, 8;
cvt.u64.u32 %rd1, %r153;
mul.lo.s32 %r154, %r4, %r2;
shl.b32 %r155, %r154, 4;
or.b32 %r156, %r155, 15;
and.b32 %r5, %r156, -16;
add.s32 %r157, %r156, %r5;
and.b32 %r158, %r157, -16;
cvt.s64.s32 %rd2, %r158;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_3612f883_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r108;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r159, %r7, 1;
setp.lt.s32 %p7, %r159, %r108;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
// end inline asm
shl.b32 %r163, %r6, 4;
add.s32 %r161, %r160, %r163;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r162, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r162, 0;
cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r164, %r4, 215;
div.s32 %r165, %r164, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r166, %r9, %r165;
add.s32 %r167, %r166, -1;
div.s32 %r10, %r167, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r169, %ctaid.y;
mul.lo.s32 %r170, %r10, %r4;
mul.lo.s32 %r11, %r170, %r169;
shl.b32 %r171, %r8, 3;
shl.b32 %r172, %r6, 4;
mad.lo.s32 %r12, %r171, %r108, %r172;
mul.lo.s32 %r173, %r108, %r8;
cvt.s64.s32 %rd53, %r173;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r174, %r11, %r108;
cvt.s64.s32 %rd6, %r174;
mul.lo.s32 %r13, %r108, %r4;
mul.lo.s32 %r14, %r10, %r169;
add.s32 %r15, %r173, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r177, %r176, %r16;
shr.u32 %r17, %r6, 5;
add.s32 %r178, %r177, %r17;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r18, %r6, 31;
add.s32 %r179, %r177, %r18;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r601, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r12, %r182;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r12, %r192;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r601, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r601;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r601;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r601;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r601;
mad.lo.s32 %r22, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r22, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r22, %r117;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r22, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r601;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r22, %r121;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r18, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r17, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r18, %r16;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r18, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r18, %r16;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r601, %r4, %r11;
mad.lo.s32 %r310, %r309, %r108, %r15;
mul.wide.s32 %rd82, %r310, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r601, %r601, 1;
setp.lt.s32 %p71, %r601, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r311, %tid.z;
mad.lo.s32 %r24, %r311, %r4, %r8;
mad.lo.s32 %r25, %r24, %r3, %r6;
mul.wide.u32 %rd83, %r25, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r312, %r4;
mov.u32 %r313, 31;
sub.s32 %r314, %r313, %r312;
mov.u32 %r315, 1;
shl.b32 %r26, %r315, %r314;
setp.lt.u32 %p72, %r8, %r26;
add.s32 %r316, %r26, %r8;
setp.lt.u32 %p73, %r316, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r317, %r3, %r314;
add.s32 %r318, %r25, %r317;
mul.wide.s32 %rd85, %r318, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r319, %r26, 31;
add.s32 %r320, %r26, %r319;
shr.s32 %r605, %r320, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r26, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r602, %r605;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r602;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r321, %r602, %r3, %r25;
mul.wide.s32 %rd86, %r321, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r29, %r602, 1;
setp.gt.u32 %p77, %r602, 3;
mov.u32 %r602, %r29;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r322, %r25, %r3;
mul.wide.u32 %rd89, %r322, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r603, %r605;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r603;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r323, %r603, %r3, %r25;
mul.wide.s32 %rd91, %r323, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r31, %r603, 1;
setp.gt.u32 %p83, %r603, 3;
mov.u32 %r603, %r31;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r604, %r605;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r604;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r324, %r604, %r3, %r25;
mul.wide.s32 %rd94, %r324, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r33, %r604, 1;
setp.gt.u32 %p89, %r604, 3;
mov.u32 %r604, %r33;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r605;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r325, %r605, %r3, %r25;
mul.wide.s32 %rd97, %r325, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r35, %r605, 1;
setp.gt.u32 %p95, %r605, 3;
mov.u32 %r605, %r35;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r334, %ctaid.y;
mad.lo.s32 %r335, %r108, %r334, %r7;
mul.wide.s32 %rd102, %r335, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r326, %r327}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r328, %r329}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r330, %r331}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r332, %r333}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_82:
mov.u32 %r36, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r336, %r6, %r8;
or.b32 %r338, %r336, %r311;
setp.ne.s32 %p98, %r338, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r339, %ctaid.x;
mov.u32 %r340, %ctaid.z;
mov.u32 %r341, %nctaid.x;
mad.lo.s32 %r342, %r340, %r341, %r339;
mul.wide.s32 %rd108, %r342, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r343, %r9, -1;
setp.eq.s32 %p99, %r36, %r343;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r606, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r606;
// end inline asm
setp.lt.u32 %p101, %r606, 256;
selp.u32 %r346, 1, 0, %p101;
shl.b32 %r606, %r606, %r346;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r347, %r4, %r2;
add.s32 %r348, %r347, -1;
div.s32 %r349, %r348, %r4;
add.s32 %r350, %r9, %r349;
add.s32 %r351, %r350, -1;
div.s32 %r39, %r351, %r9;
setp.lt.s32 %p103, %r39, 1;
@%p103 bra $L__BB0_133;
add.s32 %r353, %r9, %r3;
add.s32 %r354, %r353, -1;
shl.b32 %r40, %r8, 1;
shl.b32 %r355, %r4, 1;
mad.lo.s32 %r43, %r355, %r36, %r40;
or.b32 %r41, %r43, 1;
mul.lo.s32 %r42, %r355, %r9;
shr.u32 %r44, %r3, 5;
mul.lo.s32 %r356, %r24, %r44;
shr.u32 %r45, %r6, 5;
add.s32 %r357, %r356, %r45;
mul.wide.u32 %rd117, %r357, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r46, %r6, 31;
add.s32 %r358, %r356, %r46;
mul.wide.u32 %rd119, %r358, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r47, %r354, %r3;
mov.u32 %r607, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r47, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r360, %r42, %r607;
add.s32 %r49, %r41, %r360;
add.s32 %r50, %r43, %r360;
mov.u32 %r608, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r49, %r108;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r52, %r608, %r3, %r6;
setp.ge.s32 %p106, %r52, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r365, %r52, %r108, %r50;
mul.wide.s32 %rd121, %r365, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r361, %r362};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r363, %r364};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r608, %r608, 1;
setp.lt.s32 %p107, %r608, %r47;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r366,%r367}, %fd389;
// end inline asm
mov.u32 %r386, 31;
mov.u32 %r387, 16;
mov.u32 %r388, -1;
shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
// begin inline asm
mov.b64 %fd257, {%r368,%r369};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r370,%r371}, %fd258;
// end inline asm
mov.u32 %r389, 8;
shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
// begin inline asm
mov.b64 %fd259, {%r372,%r373};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r374,%r375}, %fd260;
// end inline asm
mov.u32 %r390, 4;
shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
// begin inline asm
mov.b64 %fd261, {%r376,%r377};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r378,%r379}, %fd262;
// end inline asm
mov.u32 %r391, 2;
shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
// begin inline asm
mov.b64 %fd263, {%r380,%r381};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r382,%r383}, %fd264;
// end inline asm
mov.u32 %r392, 1;
shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
// begin inline asm
mov.b64 %fd265, {%r384,%r385};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r46, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r45, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r46, %r44;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r393,%r394}, %fd390;
// end inline asm
mov.u32 %r413, 31;
mov.u32 %r414, 16;
mov.u32 %r415, -1;
shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
// begin inline asm
mov.b64 %fd268, {%r395,%r396};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r397,%r398}, %fd269;
// end inline asm
mov.u32 %r416, 8;
shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
// begin inline asm
mov.b64 %fd270, {%r399,%r400};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r401,%r402}, %fd271;
// end inline asm
mov.u32 %r417, 4;
shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
// begin inline asm
mov.b64 %fd272, {%r403,%r404};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r405,%r406}, %fd273;
// end inline asm
mov.u32 %r418, 2;
shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
// begin inline asm
mov.b64 %fd274, {%r407,%r408};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r409,%r410}, %fd275;
// end inline asm
mov.u32 %r419, 1;
shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
// begin inline asm
mov.b64 %fd276, {%r411,%r412};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r46, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r420,%r421}, %fd388;
// end inline asm
mov.u32 %r440, 31;
mov.u32 %r441, 16;
mov.u32 %r442, -1;
shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
// begin inline asm
mov.b64 %fd278, {%r422,%r423};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r424,%r425}, %fd279;
// end inline asm
mov.u32 %r443, 8;
shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
// begin inline asm
mov.b64 %fd280, {%r426,%r427};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r428,%r429}, %fd281;
// end inline asm
mov.u32 %r444, 4;
shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
// begin inline asm
mov.b64 %fd282, {%r430,%r431};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r432,%r433}, %fd283;
// end inline asm
mov.u32 %r445, 2;
shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
// begin inline asm
mov.b64 %fd284, {%r434,%r435};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r436,%r437}, %fd285;
// end inline asm
mov.u32 %r446, 1;
shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
// begin inline asm
mov.b64 %fd286, {%r438,%r439};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r46, %r44;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r447,%r448}, %fd392;
// end inline asm
mov.u32 %r467, 31;
mov.u32 %r468, 16;
mov.u32 %r469, -1;
shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
// begin inline asm
mov.b64 %fd290, {%r449,%r450};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r451,%r452}, %fd291;
// end inline asm
mov.u32 %r470, 8;
shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
// begin inline asm
mov.b64 %fd292, {%r453,%r454};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r455,%r456}, %fd293;
// end inline asm
mov.u32 %r471, 4;
shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
// begin inline asm
mov.b64 %fd294, {%r457,%r458};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r459,%r460}, %fd295;
// end inline asm
mov.u32 %r472, 2;
shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
// begin inline asm
mov.b64 %fd296, {%r461,%r462};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r463,%r464}, %fd297;
// end inline asm
mov.u32 %r473, 1;
shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
// begin inline asm
mov.b64 %fd298, {%r465,%r466};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r54, %r42, %r607;
add.s32 %r474, %r41, %r54;
setp.ge.s32 %p156, %r474, %r108;
@%p156 bra $L__BB0_109;
add.s32 %r479, %r43, %r54;
mul.wide.s32 %rd125, %r479, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r475, %r476}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r477, %r478}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
// end inline asm
$L__BB0_109:
add.s32 %r607, %r607, 1;
setp.lt.s32 %p158, %r607, %r39;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r56, %r108, %r6, %r40;
shl.b32 %r57, %r36, 1;
shl.b32 %r58, %r9, 1;
mul.lo.s32 %r59, %r108, %r3;
mov.u32 %r609, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r61, %r42, %r609, %r41;
mad.lo.s32 %r482, %r58, %r609, %r57;
mad.lo.s32 %r611, %r4, %r482, %r56;
mov.u32 %r612, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r610, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r61, %r108;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r610, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r611, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r483, %r484};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r485, %r486};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r611, %r611, %r59;
add.s32 %r610, %r610, %r3;
add.s32 %r612, %r612, 1;
setp.lt.s32 %p162, %r612, %r47;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r487,%r488}, %fd399;
// end inline asm
mov.u32 %r507, 31;
mov.u32 %r508, 16;
mov.u32 %r509, -1;
shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
// begin inline asm
mov.b64 %fd310, {%r489,%r490};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r491,%r492}, %fd311;
// end inline asm
mov.u32 %r510, 8;
shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
// begin inline asm
mov.b64 %fd312, {%r493,%r494};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r495,%r496}, %fd313;
// end inline asm
mov.u32 %r511, 4;
shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
// begin inline asm
mov.b64 %fd314, {%r497,%r498};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r499,%r500}, %fd315;
// end inline asm
mov.u32 %r512, 2;
shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
// begin inline asm
mov.b64 %fd316, {%r501,%r502};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r503,%r504}, %fd317;
// end inline asm
mov.u32 %r513, 1;
shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
// begin inline asm
mov.b64 %fd318, {%r505,%r506};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r46, %r44;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r514,%r515}, %fd400;
// end inline asm
mov.u32 %r534, 31;
mov.u32 %r535, 16;
mov.u32 %r536, -1;
shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
// begin inline asm
mov.b64 %fd321, {%r516,%r517};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r518,%r519}, %fd322;
// end inline asm
mov.u32 %r537, 8;
shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
// begin inline asm
mov.b64 %fd323, {%r520,%r521};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r522,%r523}, %fd324;
// end inline asm
mov.u32 %r538, 4;
shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
// begin inline asm
mov.b64 %fd325, {%r524,%r525};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r526,%r527}, %fd326;
// end inline asm
mov.u32 %r539, 2;
shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
// begin inline asm
mov.b64 %fd327, {%r528,%r529};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r530,%r531}, %fd328;
// end inline asm
mov.u32 %r540, 1;
shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
// begin inline asm
mov.b64 %fd329, {%r532,%r533};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r541,%r542}, %fd398;
// end inline asm
mov.u32 %r561, 31;
mov.u32 %r562, 16;
mov.u32 %r563, -1;
shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
// begin inline asm
mov.b64 %fd331, {%r543,%r544};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r545,%r546}, %fd332;
// end inline asm
mov.u32 %r564, 8;
shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
// begin inline asm
mov.b64 %fd333, {%r547,%r548};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r549,%r550}, %fd334;
// end inline asm
mov.u32 %r565, 4;
shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
// begin inline asm
mov.b64 %fd335, {%r551,%r552};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r553,%r554}, %fd336;
// end inline asm
mov.u32 %r566, 2;
shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
// begin inline asm
mov.b64 %fd337, {%r555,%r556};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r557,%r558}, %fd338;
// end inline asm
mov.u32 %r567, 1;
shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
// begin inline asm
mov.b64 %fd339, {%r559,%r560};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r46, %r44;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r568,%r569}, %fd402;
// end inline asm
mov.u32 %r588, 31;
mov.u32 %r589, 16;
mov.u32 %r590, -1;
shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
// begin inline asm
mov.b64 %fd343, {%r570,%r571};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r572,%r573}, %fd344;
// end inline asm
mov.u32 %r591, 8;
shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
// begin inline asm
mov.b64 %fd345, {%r574,%r575};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r576,%r577}, %fd346;
// end inline asm
mov.u32 %r592, 4;
shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
// begin inline asm
mov.b64 %fd347, {%r578,%r579};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r580,%r581}, %fd348;
// end inline asm
mov.u32 %r593, 2;
shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
// begin inline asm
mov.b64 %fd349, {%r582,%r583};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r584,%r585}, %fd350;
// end inline asm
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
// begin inline asm
mov.b64 %fd351, {%r586,%r587};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r69, %r42, %r609;
add.s32 %r595, %r41, %r69;
setp.ge.s32 %p211, %r595, %r108;
@%p211 bra $L__BB0_132;
add.s32 %r600, %r43, %r69;
mul.wide.s32 %rd133, %r600, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r596, %r597}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r598, %r599}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
// end inline asm
$L__BB0_132:
add.s32 %r609, %r609, 1;
setp.lt.s32 %p213, %r609, %r39;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72335arrayE[];
.entry _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
.reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
ld.param.v2.u32 {%r106, %r107}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r116, %r117}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r120, %r121}, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd38, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72339nvfuser_6ENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r142, %r107, 1;
shr.u32 %r143, %r142, 31;
add.s32 %r144, %r142, %r143;
shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
max.s32 %r145, %r2, %r3;
add.s32 %r146, %r145, 31;
shr.s32 %r147, %r146, 31;
shr.u32 %r148, %r147, 27;
add.s32 %r149, %r146, %r148;
shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r151, %r4, %r150;
shl.b32 %r152, %r151, 8;
cvt.u64.u32 %rd1, %r152;
mul.lo.s32 %r153, %r4, %r2;
shl.b32 %r154, %r153, 4;
or.b32 %r155, %r154, 15;
and.b32 %r5, %r155, -16;
add.s32 %r156, %r155, %r5;
and.b32 %r157, %r156, -16;
cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN56_GLOBAL__N__00000000_18___tmp_nvfuser_6_cu_41cca52c_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
or.b32 %r158, %r7, 1;
setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
// end inline asm
shl.b32 %r162, %r6, 4;
add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r161, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r161, 0;
cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r163, %r4, 215;
div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r165, %r9, %r164;
add.s32 %r166, %r165, -1;
div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r168, %ctaid.y;
mul.lo.s32 %r169, %r10, %r4;
mul.lo.s32 %r11, %r169, %r168;
mad.lo.s32 %r170, %r2, %r8, %r6;
shl.b32 %r12, %r170, 4;
mul.lo.s32 %r171, %r107, %r8;
cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r172, %r11, %r107;
cvt.s64.s32 %rd6, %r172;
mul.lo.s32 %r13, %r107, %r4;
mul.lo.s32 %r14, %r10, %r168;
shl.b32 %r173, %r8, 1;
mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r177, %r176, %r15;
shr.u32 %r16, %r6, 5;
add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r6, 31;
add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r176, 8;
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
// end inline asm
add.s32 %r183, %r182, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
// end inline asm
add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r184, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r184, 0;
cp.async.ca.shared.global [%r183], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
$L__BB0_13:
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
bra.uni $L__BB0_14;
$L__BB0_10:
mov.f64 %fd364, 0d0000000000000000;
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
add.s32 %r190, %r14, %r600;
mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r194, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r194, 0;
cp.async.ca.shared.global [%r193], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f64 %fd371, %fd370;
@%p18 bra $L__BB0_22;
ld.shared.v2.f64 {%fd118, %fd119}, [%rd7];
ld.shared.v2.f64 {%fd122, %fd123}, [%rd9];
ld.shared.v2.f64 {%fd126, %fd127}, [%rd11];
mul.f64 %fd130, %fd126, %fd118;
add.f64 %fd131, %fd130, 0d0000000000000000;
sub.f64 %fd132, %fd122, %fd366;
mul.f64 %fd133, %fd367, %fd132;
fma.rn.f64 %fd134, %fd130, %fd133, 0d0000000000000000;
fma.rn.f64 %fd368, %fd133, %fd118, %fd368;
mul.f64 %fd135, %fd127, %fd119;
add.f64 %fd371, %fd131, %fd135;
sub.f64 %fd136, %fd123, %fd366;
mul.f64 %fd137, %fd367, %fd136;
fma.rn.f64 %fd370, %fd135, %fd137, %fd134;
fma.rn.f64 %fd369, %fd137, %fd119, %fd369;
$L__BB0_22:
// begin inline asm
mov.b64 {%r197,%r198}, %fd371;
// end inline asm
mov.u32 %r217, 31;
mov.u32 %r218, 16;
mov.u32 %r219, -1;
shfl.sync.bfly.b32 %r200|%p21, %r198, %r218, %r217, %r219;
shfl.sync.bfly.b32 %r199|%p22, %r197, %r218, %r217, %r219;
// begin inline asm
mov.b64 %fd139, {%r199,%r200};
// end inline asm
add.f64 %fd140, %fd371, %fd139;
// begin inline asm
mov.b64 {%r201,%r202}, %fd140;
// end inline asm
mov.u32 %r220, 8;
shfl.sync.bfly.b32 %r204|%p23, %r202, %r220, %r217, %r219;
shfl.sync.bfly.b32 %r203|%p24, %r201, %r220, %r217, %r219;
// begin inline asm
mov.b64 %fd141, {%r203,%r204};
// end inline asm
add.f64 %fd142, %fd140, %fd141;
// begin inline asm
mov.b64 {%r205,%r206}, %fd142;
// end inline asm
mov.u32 %r221, 4;
shfl.sync.bfly.b32 %r208|%p25, %r206, %r221, %r217, %r219;
shfl.sync.bfly.b32 %r207|%p26, %r205, %r221, %r217, %r219;
// begin inline asm
mov.b64 %fd143, {%r207,%r208};
// end inline asm
add.f64 %fd144, %fd142, %fd143;
// begin inline asm
mov.b64 {%r209,%r210}, %fd144;
// end inline asm
mov.u32 %r222, 2;
shfl.sync.bfly.b32 %r212|%p27, %r210, %r222, %r217, %r219;
shfl.sync.bfly.b32 %r211|%p28, %r209, %r222, %r217, %r219;
// begin inline asm
mov.b64 %fd145, {%r211,%r212};
// end inline asm
add.f64 %fd146, %fd144, %fd145;
// begin inline asm
mov.b64 {%r213,%r214}, %fd146;
// end inline asm
mov.u32 %r223, 1;
shfl.sync.bfly.b32 %r216|%p29, %r214, %r223, %r217, %r219;
shfl.sync.bfly.b32 %r215|%p30, %r213, %r223, %r217, %r219;
// begin inline asm
mov.b64 %fd147, {%r215,%r216};
// end inline asm
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
$L__BB0_27:
// begin inline asm
mov.b64 {%r224,%r225}, %fd372;
// end inline asm
mov.u32 %r244, 31;
mov.u32 %r245, 16;
mov.u32 %r246, -1;
shfl.sync.bfly.b32 %r227|%p34, %r225, %r245, %r244, %r246;
shfl.sync.bfly.b32 %r226|%p35, %r224, %r245, %r244, %r246;
// begin inline asm
mov.b64 %fd150, {%r226,%r227};
// end inline asm
add.f64 %fd151, %fd372, %fd150;
// begin inline asm
mov.b64 {%r228,%r229}, %fd151;
// end inline asm
mov.u32 %r247, 8;
shfl.sync.bfly.b32 %r231|%p36, %r229, %r247, %r244, %r246;
shfl.sync.bfly.b32 %r230|%p37, %r228, %r247, %r244, %r246;
// begin inline asm
mov.b64 %fd152, {%r230,%r231};
// end inline asm
add.f64 %fd153, %fd151, %fd152;
// begin inline asm
mov.b64 {%r232,%r233}, %fd153;
// end inline asm
mov.u32 %r248, 4;
shfl.sync.bfly.b32 %r235|%p38, %r233, %r248, %r244, %r246;
shfl.sync.bfly.b32 %r234|%p39, %r232, %r248, %r244, %r246;
// begin inline asm
mov.b64 %fd154, {%r234,%r235};
// end inline asm
add.f64 %fd155, %fd153, %fd154;
// begin inline asm
mov.b64 {%r236,%r237}, %fd155;
// end inline asm
mov.u32 %r249, 2;
shfl.sync.bfly.b32 %r239|%p40, %r237, %r249, %r244, %r246;
shfl.sync.bfly.b32 %r238|%p41, %r236, %r249, %r244, %r246;
// begin inline asm
mov.b64 %fd156, {%r238,%r239};
// end inline asm
add.f64 %fd157, %fd155, %fd156;
// begin inline asm
mov.b64 {%r240,%r241}, %fd157;
// end inline asm
mov.u32 %r250, 1;
shfl.sync.bfly.b32 %r243|%p42, %r241, %r250, %r244, %r246;
shfl.sync.bfly.b32 %r242|%p43, %r240, %r250, %r244, %r246;
// begin inline asm
mov.b64 %fd158, {%r242,%r243};
// end inline asm
add.f64 %fd373, %fd157, %fd158;
$L__BB0_28:
bar.sync 0;
// begin inline asm
mov.b64 {%r251,%r252}, %fd370;
// end inline asm
mov.u32 %r271, 31;
mov.u32 %r272, 16;
mov.u32 %r273, -1;
shfl.sync.bfly.b32 %r254|%p44, %r252, %r272, %r271, %r273;
shfl.sync.bfly.b32 %r253|%p45, %r251, %r272, %r271, %r273;
// begin inline asm
mov.b64 %fd160, {%r253,%r254};
// end inline asm
add.f64 %fd161, %fd370, %fd160;
// begin inline asm
mov.b64 {%r255,%r256}, %fd161;
// end inline asm
mov.u32 %r274, 8;
shfl.sync.bfly.b32 %r258|%p46, %r256, %r274, %r271, %r273;
shfl.sync.bfly.b32 %r257|%p47, %r255, %r274, %r271, %r273;
// begin inline asm
mov.b64 %fd162, {%r257,%r258};
// end inline asm
add.f64 %fd163, %fd161, %fd162;
// begin inline asm
mov.b64 {%r259,%r260}, %fd163;
// end inline asm
mov.u32 %r275, 4;
shfl.sync.bfly.b32 %r262|%p48, %r260, %r275, %r271, %r273;
shfl.sync.bfly.b32 %r261|%p49, %r259, %r275, %r271, %r273;
// begin inline asm
mov.b64 %fd164, {%r261,%r262};
// end inline asm
add.f64 %fd165, %fd163, %fd164;
// begin inline asm
mov.b64 {%r263,%r264}, %fd165;
// end inline asm
mov.u32 %r276, 2;
shfl.sync.bfly.b32 %r266|%p50, %r264, %r276, %r271, %r273;
shfl.sync.bfly.b32 %r265|%p51, %r263, %r276, %r271, %r273;
// begin inline asm
mov.b64 %fd166, {%r265,%r266};
// end inline asm
add.f64 %fd167, %fd165, %fd166;
// begin inline asm
mov.b64 {%r267,%r268}, %fd167;
// end inline asm
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r270|%p52, %r268, %r277, %r271, %r273;
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
// begin inline asm
mov.b64 %fd168, {%r269,%r270};
// end inline asm
add.f64 %fd375, %fd167, %fd168;
setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
$L__BB0_30:
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
$L__BB0_33:
// begin inline asm
mov.b64 {%r278,%r279}, %fd374;
// end inline asm
mov.u32 %r298, 31;
mov.u32 %r299, 16;
mov.u32 %r300, -1;
shfl.sync.bfly.b32 %r281|%p57, %r279, %r299, %r298, %r300;
shfl.sync.bfly.b32 %r280|%p58, %r278, %r299, %r298, %r300;
// begin inline asm
mov.b64 %fd172, {%r280,%r281};
// end inline asm
add.f64 %fd173, %fd374, %fd172;
// begin inline asm
mov.b64 {%r282,%r283}, %fd173;
// end inline asm
mov.u32 %r301, 8;
shfl.sync.bfly.b32 %r285|%p59, %r283, %r301, %r298, %r300;
shfl.sync.bfly.b32 %r284|%p60, %r282, %r301, %r298, %r300;
// begin inline asm
mov.b64 %fd174, {%r284,%r285};
// end inline asm
add.f64 %fd175, %fd173, %fd174;
// begin inline asm
mov.b64 {%r286,%r287}, %fd175;
// end inline asm
mov.u32 %r302, 4;
shfl.sync.bfly.b32 %r289|%p61, %r287, %r302, %r298, %r300;
shfl.sync.bfly.b32 %r288|%p62, %r286, %r302, %r298, %r300;
// begin inline asm
mov.b64 %fd176, {%r288,%r289};
// end inline asm
add.f64 %fd177, %fd175, %fd176;
// begin inline asm
mov.b64 {%r290,%r291}, %fd177;
// end inline asm
mov.u32 %r303, 2;
shfl.sync.bfly.b32 %r293|%p63, %r291, %r303, %r298, %r300;
shfl.sync.bfly.b32 %r292|%p64, %r290, %r303, %r298, %r300;
// begin inline asm
mov.b64 %fd178, {%r292,%r293};
// end inline asm
add.f64 %fd179, %fd177, %fd178;
// begin inline asm
mov.b64 {%r294,%r295}, %fd179;
// end inline asm
mov.u32 %r304, 1;
shfl.sync.bfly.b32 %r297|%p65, %r295, %r304, %r298, %r300;
shfl.sync.bfly.b32 %r296|%p66, %r294, %r304, %r298, %r300;
// begin inline asm
mov.b64 %fd180, {%r296,%r297};
// end inline asm
add.f64 %fd375, %fd179, %fd180;
$L__BB0_34:
bar.sync 0;
setp.ne.s32 %p67, %r6, 0;
@%p67 bra $L__BB0_36;
st.shared.f64 [%rd12], %fd33;
$L__BB0_36:
bar.sync 0;
ld.shared.f64 %fd38, [%rd12];
bar.sync 0;
@%p67 bra $L__BB0_38;
add.f64 %fd181, %fd375, 0d0000000000000000;
selp.f64 %fd182, %fd181, 0d0000000000000000, %p4;
st.shared.f64 [%rd12], %fd182;
$L__BB0_38:
bar.sync 0;
ld.shared.f64 %fd39, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_40;
mul.f64 %fd183, %fd2, %fd367;
ld.shared.v2.f64 {%fd184, %fd185}, [%rd9];
ld.shared.v2.f64 {%fd188, %fd189}, [%rd11];
ld.shared.v2.f64 {%fd192, %fd193}, [%rd7];
mul.f64 %fd196, %fd188, %fd192;
mul.f64 %fd197, %fd196, %fd1;
sub.f64 %fd198, %fd184, %fd366;
mul.f64 %fd199, %fd367, %fd198;
sub.f64 %fd200, %fd197, %fd38;
mul.f64 %fd201, %fd39, %fd199;
sub.f64 %fd202, %fd200, %fd201;
mul.f64 %fd203, %fd183, %fd202;
mov.b64 %rd80, %fd203;
mul.f64 %fd204, %fd189, %fd193;
mul.f64 %fd205, %fd204, %fd1;
sub.f64 %fd206, %fd185, %fd366;
mul.f64 %fd207, %fd367, %fd206;
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
mad.lo.s32 %r309, %r21, %r107, %r7;
mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
// end inline asm
$L__BB0_40:
add.s32 %r600, %r600, 1;
setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
mov.u32 %r310, %tid.z;
mad.lo.s32 %r23, %r310, %r4, %r8;
mad.lo.s32 %r24, %r23, %r3, %r6;
mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
clz.b32 %r311, %r4;
mov.u32 %r312, 31;
sub.s32 %r313, %r312, %r311;
mov.u32 %r314, 1;
shl.b32 %r25, %r314, %r313;
setp.lt.u32 %p72, %r8, %r25;
add.s32 %r315, %r25, %r8;
setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
shl.b32 %r316, %r3, %r313;
add.s32 %r317, %r24, %r316;
mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
shr.u32 %r318, %r25, 31;
add.s32 %r319, %r25, %r318;
shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
ld.shared.f64 %fd212, [%rd23];
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
mov.u32 %r601, %r604;
$L__BB0_45:
setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
mad.lo.s32 %r320, %r601, %r3, %r24;
mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
shr.u32 %r28, %r601, 1;
setp.gt.u32 %p77, %r601, 3;
mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r321, %r24, %r3;
mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f64 %fd219, [%rd22];
add.f64 %fd380, %fd219, 0d0000000000000000;
@%p79 bra $L__BB0_51;
ld.shared.f64 %fd220, [%rd24];
add.f64 %fd380, %fd380, %fd220;
$L__BB0_51:
bar.sync 0;
st.shared.f64 [%rd22], %fd369;
bar.sync 0;
@%p74 bra $L__BB0_53;
ld.shared.f64 %fd221, [%rd23];
ld.shared.f64 %fd222, [%rd22];
add.f64 %fd223, %fd221, %fd222;
st.shared.f64 [%rd22], %fd223;
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
mov.u32 %r602, %r604;
$L__BB0_55:
setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
mad.lo.s32 %r322, %r602, %r3, %r24;
mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
shr.u32 %r30, %r602, 1;
setp.gt.u32 %p83, %r602, 3;
mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f64 %fd228, [%rd22];
add.f64 %fd381, %fd228, 0d0000000000000000;
@%p85 bra $L__BB0_61;
ld.shared.f64 %fd229, [%rd24];
add.f64 %fd381, %fd381, %fd229;
$L__BB0_61:
bar.sync 0;
st.shared.f64 [%rd22], %fd378;
bar.sync 0;
@%p74 bra $L__BB0_63;
ld.shared.f64 %fd230, [%rd23];
ld.shared.f64 %fd231, [%rd22];
add.f64 %fd232, %fd230, %fd231;
st.shared.f64 [%rd22], %fd232;
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
mov.u32 %r603, %r604;
$L__BB0_65:
setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
mad.lo.s32 %r323, %r603, %r3, %r24;
mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
shr.u32 %r32, %r603, 1;
setp.gt.u32 %p89, %r603, 3;
mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f64 %fd237, [%rd22];
add.f64 %fd382, %fd237, 0d0000000000000000;
@%p91 bra $L__BB0_71;
ld.shared.f64 %fd238, [%rd24];
add.f64 %fd382, %fd382, %fd238;
$L__BB0_71:
bar.sync 0;
st.shared.f64 [%rd22], %fd379;
bar.sync 0;
@%p74 bra $L__BB0_73;
ld.shared.f64 %fd239, [%rd23];
ld.shared.f64 %fd240, [%rd22];
add.f64 %fd241, %fd239, %fd240;
st.shared.f64 [%rd22], %fd241;
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
mad.lo.s32 %r324, %r604, %r3, %r24;
mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
shr.u32 %r34, %r604, 1;
setp.gt.u32 %p95, %r604, 3;
mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
setp.lt.u32 %p97, %r4, 2;
ld.shared.f64 %fd246, [%rd22];
add.f64 %fd383, %fd246, 0d0000000000000000;
@%p97 bra $L__BB0_80;
ld.shared.f64 %fd247, [%rd24];
add.f64 %fd383, %fd383, %fd247;
$L__BB0_80:
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
mov.u32 %r333, %ctaid.y;
mad.lo.s32 %r334, %r107, %r333, %r7;
mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
mov.b64 {%r327, %r328}, %rd104;
// begin inline asm
st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
// end inline asm
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
mov.b64 {%r331, %r332}, %rd106;
// begin inline asm
st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
// end inline asm
$L__BB0_82:
mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r335, %r6, %r8;
or.b32 %r337, %r335, %r310;
setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
mov.u32 %r338, %ctaid.x;
mov.u32 %r339, %ctaid.z;
mov.u32 %r340, %nctaid.x;
mad.lo.s32 %r341, %r339, %r340, %r338;
mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
add.s32 %r342, %r9, -1;
setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
mov.u32 %r605, 8;
$L__BB0_85:
// begin inline asm
nanosleep.u32 %r605;
// end inline asm
setp.lt.u32 %p101, %r605, 256;
selp.u32 %r345, 1, 0, %p101;
shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
add.s32 %r346, %r4, %r2;
add.s32 %r347, %r346, -1;
div.s32 %r348, %r347, %r4;
add.s32 %r349, %r9, %r348;
add.s32 %r350, %r349, -1;
div.s32 %r38, %r350, %r9;
setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
add.s32 %r352, %r9, %r3;
add.s32 %r353, %r352, -1;
shl.b32 %r39, %r8, 1;
shl.b32 %r354, %r4, 1;
mad.lo.s32 %r42, %r354, %r35, %r39;
or.b32 %r40, %r42, 1;
mul.lo.s32 %r41, %r354, %r9;
shr.u32 %r43, %r3, 5;
mul.lo.s32 %r355, %r23, %r43;
shr.u32 %r44, %r6, 5;
add.s32 %r356, %r355, %r44;
mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
and.b32 %r45, %r6, 31;
add.s32 %r357, %r355, %r45;
mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
div.s32 %r46, %r353, %r3;
mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
mul.lo.s32 %r359, %r41, %r606;
add.s32 %r48, %r40, %r359;
add.s32 %r49, %r42, %r359;
mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
mad.lo.s32 %r51, %r607, %r3, %r6;
setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
mad.lo.s32 %r364, %r51, %r107, %r49;
mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
// begin inline asm
ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
// end inline asm
mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
add.s32 %r607, %r607, 1;
setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
// begin inline asm
mov.b64 {%r365,%r366}, %fd389;
// end inline asm
mov.u32 %r385, 31;
mov.u32 %r386, 16;
mov.u32 %r387, -1;
shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
// begin inline asm
mov.b64 %fd257, {%r367,%r368};
// end inline asm
add.f64 %fd258, %fd389, %fd257;
// begin inline asm
mov.b64 {%r369,%r370}, %fd258;
// end inline asm
mov.u32 %r388, 8;
shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
// begin inline asm
mov.b64 %fd259, {%r371,%r372};
// end inline asm
add.f64 %fd260, %fd258, %fd259;
// begin inline asm
mov.b64 {%r373,%r374}, %fd260;
// end inline asm
mov.u32 %r389, 4;
shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
// begin inline asm
mov.b64 %fd261, {%r375,%r376};
// end inline asm
add.f64 %fd262, %fd260, %fd261;
// begin inline asm
mov.b64 {%r377,%r378}, %fd262;
// end inline asm
mov.u32 %r390, 2;
shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
// begin inline asm
mov.b64 %fd263, {%r379,%r380};
// end inline asm
add.f64 %fd264, %fd262, %fd263;
// begin inline asm
mov.b64 {%r381,%r382}, %fd264;
// end inline asm
mov.u32 %r391, 1;
shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
// begin inline asm
mov.b64 %fd265, {%r383,%r384};
// end inline asm
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
// begin inline asm
mov.b64 {%r392,%r393}, %fd390;
// end inline asm
mov.u32 %r412, 31;
mov.u32 %r413, 16;
mov.u32 %r414, -1;
shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
// begin inline asm
mov.b64 %fd268, {%r394,%r395};
// end inline asm
add.f64 %fd269, %fd390, %fd268;
// begin inline asm
mov.b64 {%r396,%r397}, %fd269;
// end inline asm
mov.u32 %r415, 8;
shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
// begin inline asm
mov.b64 %fd270, {%r398,%r399};
// end inline asm
add.f64 %fd271, %fd269, %fd270;
// begin inline asm
mov.b64 {%r400,%r401}, %fd271;
// end inline asm
mov.u32 %r416, 4;
shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
// begin inline asm
mov.b64 %fd272, {%r402,%r403};
// end inline asm
add.f64 %fd273, %fd271, %fd272;
// begin inline asm
mov.b64 {%r404,%r405}, %fd273;
// end inline asm
mov.u32 %r417, 2;
shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
// begin inline asm
mov.b64 %fd274, {%r406,%r407};
// end inline asm
add.f64 %fd275, %fd273, %fd274;
// begin inline asm
mov.b64 {%r408,%r409}, %fd275;
// end inline asm
mov.u32 %r418, 1;
shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
// begin inline asm
mov.b64 %fd276, {%r410,%r411};
// end inline asm
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r419,%r420}, %fd388;
// end inline asm
mov.u32 %r439, 31;
mov.u32 %r440, 16;
mov.u32 %r441, -1;
shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
// begin inline asm
mov.b64 %fd278, {%r421,%r422};
// end inline asm
add.f64 %fd279, %fd388, %fd278;
// begin inline asm
mov.b64 {%r423,%r424}, %fd279;
// end inline asm
mov.u32 %r442, 8;
shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
// begin inline asm
mov.b64 %fd280, {%r425,%r426};
// end inline asm
add.f64 %fd281, %fd279, %fd280;
// begin inline asm
mov.b64 {%r427,%r428}, %fd281;
// end inline asm
mov.u32 %r443, 4;
shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
// begin inline asm
mov.b64 %fd282, {%r429,%r430};
// end inline asm
add.f64 %fd283, %fd281, %fd282;
// begin inline asm
mov.b64 {%r431,%r432}, %fd283;
// end inline asm
mov.u32 %r444, 2;
shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
// begin inline asm
mov.b64 %fd284, {%r433,%r434};
// end inline asm
add.f64 %fd285, %fd283, %fd284;
// begin inline asm
mov.b64 {%r435,%r436}, %fd285;
// end inline asm
mov.u32 %r445, 1;
shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
// begin inline asm
mov.b64 %fd286, {%r437,%r438};
// end inline asm
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
st.shared.f64 [%rd29], %fd393;
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
// begin inline asm
mov.b64 {%r446,%r447}, %fd392;
// end inline asm
mov.u32 %r466, 31;
mov.u32 %r467, 16;
mov.u32 %r468, -1;
shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
// begin inline asm
mov.b64 %fd290, {%r448,%r449};
// end inline asm
add.f64 %fd291, %fd392, %fd290;
// begin inline asm
mov.b64 {%r450,%r451}, %fd291;
// end inline asm
mov.u32 %r469, 8;
shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
// begin inline asm
mov.b64 %fd292, {%r452,%r453};
// end inline asm
add.f64 %fd293, %fd291, %fd292;
// begin inline asm
mov.b64 {%r454,%r455}, %fd293;
// end inline asm
mov.u32 %r470, 4;
shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
// begin inline asm
mov.b64 %fd294, {%r456,%r457};
// end inline asm
add.f64 %fd295, %fd293, %fd294;
// begin inline asm
mov.b64 {%r458,%r459}, %fd295;
// end inline asm
mov.u32 %r471, 2;
shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
// begin inline asm
mov.b64 %fd296, {%r460,%r461};
// end inline asm
add.f64 %fd297, %fd295, %fd296;
// begin inline asm
mov.b64 {%r462,%r463}, %fd297;
// end inline asm
mov.u32 %r472, 1;
shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
// begin inline asm
mov.b64 %fd298, {%r464,%r465};
// end inline asm
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
mul.lo.s32 %r53, %r41, %r606;
add.s32 %r473, %r40, %r53;
setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
add.s32 %r478, %r42, %r53;
mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
mov.b64 {%r476, %r477}, %rd127;
// begin inline asm
st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
// end inline asm
$L__BB0_109:
add.s32 %r606, %r606, 1;
setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
mad.lo.s32 %r55, %r107, %r6, %r39;
shl.b32 %r56, %r35, 1;
shl.b32 %r57, %r9, 1;
mul.lo.s32 %r58, %r107, %r3;
mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
mad.lo.s32 %r60, %r41, %r608, %r40;
mad.lo.s32 %r481, %r57, %r608, %r56;
mad.lo.s32 %r610, %r4, %r481, %r55;
mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
// begin inline asm
ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
// end inline asm
mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
add.s32 %r610, %r610, %r58;
add.s32 %r609, %r609, %r3;
add.s32 %r611, %r611, 1;
setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
// begin inline asm
mov.b64 {%r486,%r487}, %fd399;
// end inline asm
mov.u32 %r506, 31;
mov.u32 %r507, 16;
mov.u32 %r508, -1;
shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
// begin inline asm
mov.b64 %fd310, {%r488,%r489};
// end inline asm
add.f64 %fd311, %fd399, %fd310;
// begin inline asm
mov.b64 {%r490,%r491}, %fd311;
// end inline asm
mov.u32 %r509, 8;
shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
// begin inline asm
mov.b64 %fd312, {%r492,%r493};
// end inline asm
add.f64 %fd313, %fd311, %fd312;
// begin inline asm
mov.b64 {%r494,%r495}, %fd313;
// end inline asm
mov.u32 %r510, 4;
shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
// begin inline asm
mov.b64 %fd314, {%r496,%r497};
// end inline asm
add.f64 %fd315, %fd313, %fd314;
// begin inline asm
mov.b64 {%r498,%r499}, %fd315;
// end inline asm
mov.u32 %r511, 2;
shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
// begin inline asm
mov.b64 %fd316, {%r500,%r501};
// end inline asm
add.f64 %fd317, %fd315, %fd316;
// begin inline asm
mov.b64 {%r502,%r503}, %fd317;
// end inline asm
mov.u32 %r512, 1;
shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
// begin inline asm
mov.b64 %fd318, {%r504,%r505};
// end inline asm
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
st.shared.f64 [%rd29], %fd401;
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
// begin inline asm
mov.b64 {%r513,%r514}, %fd400;
// end inline asm
mov.u32 %r533, 31;
mov.u32 %r534, 16;
mov.u32 %r535, -1;
shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
// begin inline asm
mov.b64 %fd321, {%r515,%r516};
// end inline asm
add.f64 %fd322, %fd400, %fd321;
// begin inline asm
mov.b64 {%r517,%r518}, %fd322;
// end inline asm
mov.u32 %r536, 8;
shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
// begin inline asm
mov.b64 %fd323, {%r519,%r520};
// end inline asm
add.f64 %fd324, %fd322, %fd323;
// begin inline asm
mov.b64 {%r521,%r522}, %fd324;
// end inline asm
mov.u32 %r537, 4;
shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
// begin inline asm
mov.b64 %fd325, {%r523,%r524};
// end inline asm
add.f64 %fd326, %fd324, %fd325;
// begin inline asm
mov.b64 {%r525,%r526}, %fd326;
// end inline asm
mov.u32 %r538, 2;
shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
// begin inline asm
mov.b64 %fd327, {%r527,%r528};
// end inline asm
add.f64 %fd328, %fd326, %fd327;
// begin inline asm
mov.b64 {%r529,%r530}, %fd328;
// end inline asm
mov.u32 %r539, 1;
shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
// begin inline asm
mov.b64 %fd329, {%r531,%r532};
// end inline asm
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
// begin inline asm
mov.b64 {%r540,%r541}, %fd398;
// end inline asm
mov.u32 %r560, 31;
mov.u32 %r561, 16;
mov.u32 %r562, -1;
shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
// begin inline asm
mov.b64 %fd331, {%r542,%r543};
// end inline asm
add.f64 %fd332, %fd398, %fd331;
// begin inline asm
mov.b64 {%r544,%r545}, %fd332;
// end inline asm
mov.u32 %r563, 8;
shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
// begin inline asm
mov.b64 %fd333, {%r546,%r547};
// end inline asm
add.f64 %fd334, %fd332, %fd333;
// begin inline asm
mov.b64 {%r548,%r549}, %fd334;
// end inline asm
mov.u32 %r564, 4;
shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
// begin inline asm
mov.b64 %fd335, {%r550,%r551};
// end inline asm
add.f64 %fd336, %fd334, %fd335;
// begin inline asm
mov.b64 {%r552,%r553}, %fd336;
// end inline asm
mov.u32 %r565, 2;
shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
// begin inline asm
mov.b64 %fd337, {%r554,%r555};
// end inline asm
add.f64 %fd338, %fd336, %fd337;
// begin inline asm
mov.b64 {%r556,%r557}, %fd338;
// end inline asm
mov.u32 %r566, 1;
shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
// begin inline asm
mov.b64 %fd339, {%r558,%r559};
// end inline asm
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
st.shared.f64 [%rd29], %fd403;
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
// begin inline asm
mov.b64 {%r567,%r568}, %fd402;
// end inline asm
mov.u32 %r587, 31;
mov.u32 %r588, 16;
mov.u32 %r589, -1;
shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
// begin inline asm
mov.b64 %fd343, {%r569,%r570};
// end inline asm
add.f64 %fd344, %fd402, %fd343;
// begin inline asm
mov.b64 {%r571,%r572}, %fd344;
// end inline asm
mov.u32 %r590, 8;
shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
// begin inline asm
mov.b64 %fd345, {%r573,%r574};
// end inline asm
add.f64 %fd346, %fd344, %fd345;
// begin inline asm
mov.b64 {%r575,%r576}, %fd346;
// end inline asm
mov.u32 %r591, 4;
shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
// begin inline asm
mov.b64 %fd347, {%r577,%r578};
// end inline asm
add.f64 %fd348, %fd346, %fd347;
// begin inline asm
mov.b64 {%r579,%r580}, %fd348;
// end inline asm
mov.u32 %r592, 2;
shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
// begin inline asm
mov.b64 %fd349, {%r581,%r582};
// end inline asm
add.f64 %fd350, %fd348, %fd349;
// begin inline asm
mov.b64 {%r583,%r584}, %fd350;
// end inline asm
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
// begin inline asm
mov.b64 %fd351, {%r585,%r586};
// end inline asm
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
mul.lo.s32 %r68, %r41, %r608;
add.s32 %r594, %r40, %r68;
setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
add.s32 %r599, %r42, %r68;
mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
mov.b64 {%r597, %r598}, %rd135;
// begin inline asm
st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
// end inline asm
$L__BB0_132:
add.s32 %r608, %r608, 1;
setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -29,18 +29,18 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<214>;
- .reg .b32 %r<613>;
+ .reg .b32 %r<612>;
.reg .f64 %fd<404>;
.reg .b64 %rd<136>;
- ld.param.v2.u32 {%r107, %r108}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r117, %r118}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r121, %r122}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r106, %r107}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r116, %r117}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r120, %r121}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
@@ -48,113 +48,113 @@
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEES1_S1_S1_NS0_IdLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r143, %r108, 1;
- shr.u32 %r144, %r143, 31;
- add.s32 %r145, %r143, %r144;
- shr.s32 %r2, %r145, 1;
+ add.s32 %r142, %r107, 1;
+ shr.u32 %r143, %r142, 31;
+ add.s32 %r144, %r142, %r143;
+ shr.s32 %r2, %r144, 1;
mov.u32 %r3, %ntid.x;
- max.s32 %r146, %r2, %r3;
- add.s32 %r147, %r146, 31;
- shr.s32 %r148, %r147, 31;
- shr.u32 %r149, %r148, 27;
- add.s32 %r150, %r147, %r149;
- shr.u32 %r151, %r150, 5;
+ max.s32 %r145, %r2, %r3;
+ add.s32 %r146, %r145, 31;
+ shr.s32 %r147, %r146, 31;
+ shr.u32 %r148, %r147, 27;
+ add.s32 %r149, %r146, %r148;
+ shr.u32 %r150, %r149, 5;
mov.u32 %r4, %ntid.y;
- mul.lo.s32 %r152, %r4, %r151;
- shl.b32 %r153, %r152, 8;
- cvt.u64.u32 %rd1, %r153;
- mul.lo.s32 %r154, %r4, %r2;
- shl.b32 %r155, %r154, 4;
- or.b32 %r156, %r155, 15;
- and.b32 %r5, %r156, -16;
- add.s32 %r157, %r156, %r5;
- and.b32 %r158, %r157, -16;
- cvt.s64.s32 %rd2, %r158;
+ mul.lo.s32 %r151, %r4, %r150;
+ shl.b32 %r152, %r151, 8;
+ cvt.u64.u32 %rd1, %r152;
+ mul.lo.s32 %r153, %r4, %r2;
+ shl.b32 %r154, %r153, 4;
+ or.b32 %r155, %r154, 15;
+ and.b32 %r5, %r155, -16;
+ add.s32 %r156, %r155, %r5;
+ and.b32 %r157, %r156, -16;
+ cvt.s64.s32 %rd2, %r157;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
- cvt.rn.f64.s32 %fd1, %r108;
+ cvt.rn.f64.s32 %fd1, %r107;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 1;
- or.b32 %r159, %r7, 1;
- setp.lt.s32 %p7, %r159, %r108;
+ or.b32 %r158, %r7, 1;
+ setp.lt.s32 %p7, %r158, %r107;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r160, smem_ptr; }
-
-
- shl.b32 %r163, %r6, 4;
- add.s32 %r161, %r160, %r163;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r159, smem_ptr; }
+
+
+ shl.b32 %r162, %r6, 4;
+ add.s32 %r160, %r159, %r162;
mul.wide.s32 %rd48, %r7, 8;
add.s64 %rd47, %rd37, %rd48;
- mov.u32 %r162, 0;
+ mov.u32 %r161, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r162, 0;
- cp.async.ca.shared.global [%r161], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r161, 0;
+ cp.async.ca.shared.global [%r160], [%rd47], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r164, %r4, 215;
- div.s32 %r165, %r164, %r4;
+ add.s32 %r163, %r4, 215;
+ div.s32 %r164, %r163, %r4;
mov.u32 %r9, %nctaid.y;
- add.s32 %r166, %r9, %r165;
- add.s32 %r167, %r166, -1;
- div.s32 %r10, %r167, %r9;
+ add.s32 %r165, %r9, %r164;
+ add.s32 %r166, %r165, -1;
+ div.s32 %r10, %r166, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
- mov.u32 %r169, %ctaid.y;
- mul.lo.s32 %r170, %r10, %r4;
- mul.lo.s32 %r11, %r170, %r169;
- shl.b32 %r171, %r8, 3;
- shl.b32 %r172, %r6, 4;
- mad.lo.s32 %r12, %r171, %r108, %r172;
- mul.lo.s32 %r173, %r108, %r8;
- cvt.s64.s32 %rd53, %r173;
+ mov.u32 %r168, %ctaid.y;
+ mul.lo.s32 %r169, %r10, %r4;
+ mul.lo.s32 %r11, %r169, %r168;
+ mad.lo.s32 %r170, %r2, %r8, %r6;
+ shl.b32 %r12, %r170, 4;
+ mul.lo.s32 %r171, %r107, %r8;
+ cvt.s64.s32 %rd53, %r171;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r174, %r11, %r108;
- cvt.s64.s32 %rd6, %r174;
- mul.lo.s32 %r13, %r108, %r4;
- mul.lo.s32 %r14, %r10, %r169;
- add.s32 %r15, %r173, %r7;
+ mul.lo.s32 %r172, %r11, %r107;
+ cvt.s64.s32 %rd6, %r172;
+ mul.lo.s32 %r13, %r107, %r4;
+ mul.lo.s32 %r14, %r10, %r168;
+ shl.b32 %r173, %r8, 1;
+ mad.lo.s32 %r174, %r173, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 8;
+ mul.wide.s32 %rd56, %r174, 8;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r175, %tid.z;
mad.lo.s32 %r176, %r175, %r4, %r8;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r177, %r176, %r16;
- shr.u32 %r17, %r6, 5;
- add.s32 %r178, %r177, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r177, %r176, %r15;
+ shr.u32 %r16, %r6, 5;
+ add.s32 %r178, %r177, %r16;
mul.wide.u32 %rd57, %r178, 8;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
- and.b32 %r18, %r6, 31;
- add.s32 %r179, %r177, %r18;
+ and.b32 %r17, %r6, 31;
+ add.s32 %r179, %r177, %r17;
mul.wide.u32 %rd58, %r179, 8;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 8;
add.s64 %rd11, %rd59, %rd60;
@@ -162,37 +162,37 @@
add.s64 %rd12, %rd44, %rd61;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
rcp.rn.f64 %fd2, %fd1;
- mov.u32 %r601, 0;
+ mov.u32 %r600, 0;
mov.f64 %fd368, 0d0000000000000000;
not.pred %p11, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r182, smem_ptr; }
- add.s32 %r183, %r12, %r182;
+ add.s32 %r183, %r182, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r192, smem_ptr; }
- add.s32 %r193, %r12, %r192;
+ add.s32 %r193, %r192, %r12;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
- mad.lo.s32 %r180, %r601, %r4, %r8;
+ mad.lo.s32 %r180, %r600, %r4, %r8;
add.s32 %r181, %r180, %r11;
setp.gt.s32 %p12, %r181, 215;
@%p12 bra $L__BB0_8;
- mul.lo.s32 %r185, %r13, %r601;
+ mul.lo.s32 %r185, %r13, %r600;
cvt.s64.s32 %rd65, %r185;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 3;
add.s64 %rd64, %rd33, %rd68;
@@ -211,11 +211,11 @@
cp.async.wait_all;
@%p11 bra $L__BB0_10;
- add.s32 %r186, %r14, %r601;
+ add.s32 %r186, %r14, %r600;
mad.lo.s32 %r187, %r186, %r4, %r8;
setp.lt.s32 %p14, %r187, 216;
@%p14 bra $L__BB0_13;
bra.uni $L__BB0_10;
@@ -228,38 +228,38 @@
mov.f64 %fd365, 0d0000000000000000;
@%p1 bra $L__BB0_11;
bra.uni $L__BB0_14;
$L__BB0_11:
- add.s32 %r188, %r14, %r601;
+ add.s32 %r188, %r14, %r600;
mad.lo.s32 %r189, %r188, %r4, %r8;
setp.gt.s32 %p15, %r189, 215;
@%p15 bra $L__BB0_14;
ld.shared.v2.f64 {%fd364, %fd365}, [%rd7];
$L__BB0_14:
- add.s32 %r190, %r14, %r601;
- mad.lo.s32 %r22, %r190, %r4, %r8;
+ add.s32 %r190, %r14, %r600;
+ mad.lo.s32 %r21, %r190, %r4, %r8;
add.f64 %fd379, %fd379, %fd365;
add.f64 %fd378, %fd378, %fd364;
- setp.gt.s32 %p16, %r22, 215;
+ setp.gt.s32 %p16, %r21, 215;
mov.f64 %fd366, 0d0000000000000000;
@%p16 bra $L__BB0_16;
- mul.lo.s32 %r191, %r22, %r117;
+ mul.lo.s32 %r191, %r21, %r116;
mul.wide.s32 %rd69, %r191, 8;
add.s64 %rd70, %rd15, %rd69;
ld.global.f64 %fd366, [%rd70];
$L__BB0_16:
- setp.lt.s32 %p17, %r22, 216;
+ setp.lt.s32 %p17, %r21, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_18;
- mul.lo.s32 %r195, %r13, %r601;
+ mul.lo.s32 %r195, %r13, %r600;
cvt.s64.s32 %rd73, %r195;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 3;
add.s64 %rd72, %rd34, %rd76;
@@ -276,11 +276,11 @@
$L__BB0_18:
mov.f64 %fd370, 0d0000000000000000;
mov.f64 %fd367, %fd370;
@%p16 bra $L__BB0_20;
- mul.lo.s32 %r196, %r22, %r121;
+ mul.lo.s32 %r196, %r21, %r120;
mul.wide.s32 %rd77, %r196, 8;
add.s64 %rd78, %rd16, %rd77;
ld.global.f64 %fd367, [%rd78];
$L__BB0_20:
@@ -359,21 +359,21 @@
mov.b64 %fd147, {%r215,%r216};
add.f64 %fd373, %fd146, %fd147;
bar.sync 0;
- setp.ne.s32 %p31, %r18, 0;
+ setp.ne.s32 %p31, %r17, 0;
@%p31 bra $L__BB0_24;
st.shared.f64 [%rd8], %fd373;
$L__BB0_24:
- setp.ne.s32 %p32, %r17, 0;
+ setp.ne.s32 %p32, %r16, 0;
bar.sync 0;
@%p32 bra $L__BB0_28;
- setp.ge.u32 %p33, %r18, %r16;
+ setp.ge.u32 %p33, %r17, %r15;
mov.f64 %fd372, 0d0000000000000000;
@%p33 bra $L__BB0_27;
ld.shared.f64 %fd372, [%rd10];
@@ -483,11 +483,11 @@
shfl.sync.bfly.b32 %r269|%p53, %r267, %r277, %r271, %r273;
mov.b64 %fd168, {%r269,%r270};
add.f64 %fd375, %fd167, %fd168;
- setp.eq.s32 %p4, %r18, 0;
+ setp.eq.s32 %p4, %r17, 0;
bar.sync 0;
@%p31 bra $L__BB0_30;
st.shared.f64 [%rd8], %fd375;
@@ -495,11 +495,11 @@
bar.sync 0;
add.f64 %fd169, %fd373, 0d0000000000000000;
selp.f64 %fd33, %fd169, 0d0000000000000000, %p4;
@%p32 bra $L__BB0_34;
- setp.ge.u32 %p56, %r18, %r16;
+ setp.ge.u32 %p56, %r17, %r15;
mov.f64 %fd374, 0d0000000000000000;
@%p56 bra $L__BB0_33;
ld.shared.f64 %fd374, [%rd10];
@@ -600,54 +600,53 @@
sub.f64 %fd208, %fd205, %fd38;
mul.f64 %fd209, %fd39, %fd207;
sub.f64 %fd210, %fd208, %fd209;
mul.f64 %fd211, %fd183, %fd210;
mov.b64 %rd81, %fd211;
- mad.lo.s32 %r309, %r601, %r4, %r11;
- mad.lo.s32 %r310, %r309, %r108, %r15;
- mul.wide.s32 %rd82, %r310, 8;
+ mad.lo.s32 %r309, %r21, %r107, %r7;
+ mul.wide.s32 %rd82, %r309, 8;
add.s64 %rd79, %rd38, %rd82;
mov.b64 {%r305, %r306}, %rd80;
mov.b64 {%r307, %r308}, %rd81;
st.global.cs.v4.s32 [%rd79], {%r305,%r306,%r307,%r308};
$L__BB0_40:
- add.s32 %r601, %r601, 1;
- setp.lt.s32 %p71, %r601, %r10;
+ add.s32 %r600, %r600, 1;
+ setp.lt.s32 %p71, %r600, %r10;
@%p71 bra $L__BB0_5;
bra.uni $L__BB0_41;
$L__BB0_3:
mov.f64 %fd368, 0d0000000000000000;
mov.f64 %fd369, %fd368;
mov.f64 %fd378, %fd368;
mov.f64 %fd379, %fd368;
$L__BB0_41:
- mov.u32 %r311, %tid.z;
- mad.lo.s32 %r24, %r311, %r4, %r8;
- mad.lo.s32 %r25, %r24, %r3, %r6;
- mul.wide.u32 %rd83, %r25, 8;
+ mov.u32 %r310, %tid.z;
+ mad.lo.s32 %r23, %r310, %r4, %r8;
+ mad.lo.s32 %r24, %r23, %r3, %r6;
+ mul.wide.u32 %rd83, %r24, 8;
add.s64 %rd22, %rd44, %rd83;
- clz.b32 %r312, %r4;
- mov.u32 %r313, 31;
- sub.s32 %r314, %r313, %r312;
- mov.u32 %r315, 1;
- shl.b32 %r26, %r315, %r314;
- setp.lt.u32 %p72, %r8, %r26;
- add.s32 %r316, %r26, %r8;
- setp.lt.u32 %p73, %r316, %r4;
+ clz.b32 %r311, %r4;
+ mov.u32 %r312, 31;
+ sub.s32 %r313, %r312, %r311;
+ mov.u32 %r314, 1;
+ shl.b32 %r25, %r314, %r313;
+ setp.lt.u32 %p72, %r8, %r25;
+ add.s32 %r315, %r25, %r8;
+ setp.lt.u32 %p73, %r315, %r4;
and.pred %p5, %p72, %p73;
- shl.b32 %r317, %r3, %r314;
- add.s32 %r318, %r25, %r317;
- mul.wide.s32 %rd85, %r318, 8;
+ shl.b32 %r316, %r3, %r313;
+ add.s32 %r317, %r24, %r316;
+ mul.wide.s32 %rd85, %r317, 8;
add.s64 %rd23, %rd44, %rd85;
- shr.u32 %r319, %r26, 31;
- add.s32 %r320, %r26, %r319;
- shr.s32 %r605, %r320, 1;
+ shr.u32 %r318, %r25, 31;
+ add.s32 %r319, %r25, %r318;
+ shr.s32 %r604, %r319, 1;
st.shared.f64 [%rd22], %fd368;
bar.sync 0;
not.pred %p74, %p5;
@%p74 bra $L__BB0_43;
@@ -655,38 +654,38 @@
ld.shared.f64 %fd213, [%rd22];
add.f64 %fd214, %fd212, %fd213;
st.shared.f64 [%rd22], %fd214;
$L__BB0_43:
- setp.lt.s32 %p75, %r26, 4;
+ setp.lt.s32 %p75, %r25, 4;
bar.sync 0;
@%p75 bra $L__BB0_48;
- mov.u32 %r602, %r605;
+ mov.u32 %r601, %r604;
$L__BB0_45:
- setp.ge.u32 %p76, %r8, %r602;
+ setp.ge.u32 %p76, %r8, %r601;
@%p76 bra $L__BB0_47;
- mad.lo.s32 %r321, %r602, %r3, %r25;
- mul.wide.s32 %rd86, %r321, 8;
+ mad.lo.s32 %r320, %r601, %r3, %r24;
+ mul.wide.s32 %rd86, %r320, 8;
add.s64 %rd88, %rd44, %rd86;
ld.shared.f64 %fd215, [%rd22];
ld.shared.f64 %fd216, [%rd88];
add.f64 %fd217, %fd216, %fd215;
st.shared.f64 [%rd22], %fd217;
$L__BB0_47:
bar.sync 0;
- shr.u32 %r29, %r602, 1;
- setp.gt.u32 %p77, %r602, 3;
- mov.u32 %r602, %r29;
+ shr.u32 %r28, %r601, 1;
+ setp.gt.u32 %p77, %r601, 3;
+ mov.u32 %r601, %r28;
@%p77 bra $L__BB0_45;
$L__BB0_48:
- add.s32 %r322, %r25, %r3;
- mul.wide.u32 %rd89, %r322, 8;
+ add.s32 %r321, %r24, %r3;
+ mul.wide.u32 %rd89, %r321, 8;
add.s64 %rd24, %rd44, %rd89;
setp.ne.s32 %p78, %r8, 0;
mov.f64 %fd380, 0d0000000000000000;
@%p78 bra $L__BB0_51;
@@ -711,29 +710,29 @@
$L__BB0_53:
bar.sync 0;
@%p75 bra $L__BB0_58;
- mov.u32 %r603, %r605;
+ mov.u32 %r602, %r604;
$L__BB0_55:
- setp.ge.u32 %p82, %r8, %r603;
+ setp.ge.u32 %p82, %r8, %r602;
@%p82 bra $L__BB0_57;
- mad.lo.s32 %r323, %r603, %r3, %r25;
- mul.wide.s32 %rd91, %r323, 8;
+ mad.lo.s32 %r322, %r602, %r3, %r24;
+ mul.wide.s32 %rd91, %r322, 8;
add.s64 %rd93, %rd44, %rd91;
ld.shared.f64 %fd224, [%rd22];
ld.shared.f64 %fd225, [%rd93];
add.f64 %fd226, %fd225, %fd224;
st.shared.f64 [%rd22], %fd226;
$L__BB0_57:
bar.sync 0;
- shr.u32 %r31, %r603, 1;
- setp.gt.u32 %p83, %r603, 3;
- mov.u32 %r603, %r31;
+ shr.u32 %r30, %r602, 1;
+ setp.gt.u32 %p83, %r602, 3;
+ mov.u32 %r602, %r30;
@%p83 bra $L__BB0_55;
$L__BB0_58:
mov.f64 %fd381, 0d0000000000000000;
@%p78 bra $L__BB0_61;
@@ -759,29 +758,29 @@
$L__BB0_63:
bar.sync 0;
@%p75 bra $L__BB0_68;
- mov.u32 %r604, %r605;
+ mov.u32 %r603, %r604;
$L__BB0_65:
- setp.ge.u32 %p88, %r8, %r604;
+ setp.ge.u32 %p88, %r8, %r603;
@%p88 bra $L__BB0_67;
- mad.lo.s32 %r324, %r604, %r3, %r25;
- mul.wide.s32 %rd94, %r324, 8;
+ mad.lo.s32 %r323, %r603, %r3, %r24;
+ mul.wide.s32 %rd94, %r323, 8;
add.s64 %rd96, %rd44, %rd94;
ld.shared.f64 %fd233, [%rd22];
ld.shared.f64 %fd234, [%rd96];
add.f64 %fd235, %fd234, %fd233;
st.shared.f64 [%rd22], %fd235;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r33, %r604, 1;
- setp.gt.u32 %p89, %r604, 3;
- mov.u32 %r604, %r33;
+ shr.u32 %r32, %r603, 1;
+ setp.gt.u32 %p89, %r603, 3;
+ mov.u32 %r603, %r32;
@%p89 bra $L__BB0_65;
$L__BB0_68:
mov.f64 %fd382, 0d0000000000000000;
@%p78 bra $L__BB0_71;
@@ -808,26 +807,26 @@
$L__BB0_73:
bar.sync 0;
@%p75 bra $L__BB0_77;
$L__BB0_74:
- setp.ge.u32 %p94, %r8, %r605;
+ setp.ge.u32 %p94, %r8, %r604;
@%p94 bra $L__BB0_76;
- mad.lo.s32 %r325, %r605, %r3, %r25;
- mul.wide.s32 %rd97, %r325, 8;
+ mad.lo.s32 %r324, %r604, %r3, %r24;
+ mul.wide.s32 %rd97, %r324, 8;
add.s64 %rd99, %rd44, %rd97;
ld.shared.f64 %fd242, [%rd22];
ld.shared.f64 %fd243, [%rd99];
add.f64 %fd244, %fd243, %fd242;
st.shared.f64 [%rd22], %fd244;
$L__BB0_76:
bar.sync 0;
- shr.u32 %r35, %r605, 1;
- setp.gt.u32 %p95, %r605, 3;
- mov.u32 %r605, %r35;
+ shr.u32 %r34, %r604, 1;
+ setp.gt.u32 %p95, %r604, 3;
+ mov.u32 %r604, %r34;
@%p95 bra $L__BB0_74;
$L__BB0_77:
mov.f64 %fd383, 0d0000000000000000;
@%p78 bra $L__BB0_80;
@@ -844,328 +843,328 @@
bar.sync 0;
@%p2 bra $L__BB0_81;
bra.uni $L__BB0_82;
$L__BB0_81:
- mov.u32 %r334, %ctaid.y;
- mad.lo.s32 %r335, %r108, %r334, %r7;
- mul.wide.s32 %rd102, %r335, 8;
+ mov.u32 %r333, %ctaid.y;
+ mad.lo.s32 %r334, %r107, %r333, %r7;
+ mul.wide.s32 %rd102, %r334, 8;
add.s64 %rd100, %rd41, %rd102;
mov.b64 %rd103, %fd380;
- mov.b64 {%r326, %r327}, %rd103;
+ mov.b64 {%r325, %r326}, %rd103;
mov.b64 %rd104, %fd381;
- mov.b64 {%r328, %r329}, %rd104;
-
- st.volatile.global.v4.s32 [%rd100], {%r326,%r327,%r328,%r329};
+ mov.b64 {%r327, %r328}, %rd104;
+
+ st.volatile.global.v4.s32 [%rd100], {%r325,%r326,%r327,%r328};
add.s64 %rd101, %rd42, %rd102;
mov.b64 %rd105, %fd382;
- mov.b64 {%r330, %r331}, %rd105;
+ mov.b64 {%r329, %r330}, %rd105;
mov.b64 %rd106, %fd383;
- mov.b64 {%r332, %r333}, %rd106;
-
- st.volatile.global.v4.s32 [%rd101], {%r330,%r331,%r332,%r333};
+ mov.b64 {%r331, %r332}, %rd106;
+
+ st.volatile.global.v4.s32 [%rd101], {%r329,%r330,%r331,%r332};
$L__BB0_82:
- mov.u32 %r36, %ctaid.y;
+ mov.u32 %r35, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r336, %r6, %r8;
- or.b32 %r338, %r336, %r311;
- setp.ne.s32 %p98, %r338, 0;
+ or.b32 %r335, %r6, %r8;
+ or.b32 %r337, %r335, %r310;
+ setp.ne.s32 %p98, %r337, 0;
@%p98 bra $L__BB0_86;
cvta.to.global.u64 %rd107, %rd43;
- mov.u32 %r339, %ctaid.x;
- mov.u32 %r340, %ctaid.z;
- mov.u32 %r341, %nctaid.x;
- mad.lo.s32 %r342, %r340, %r341, %r339;
- mul.wide.s32 %rd108, %r342, 8;
+ mov.u32 %r338, %ctaid.x;
+ mov.u32 %r339, %ctaid.z;
+ mov.u32 %r340, %nctaid.x;
+ mad.lo.s32 %r341, %r339, %r340, %r338;
+ mul.wide.s32 %rd108, %r341, 8;
add.s64 %rd27, %rd107, %rd108;
- add.s32 %r343, %r9, -1;
- setp.eq.s32 %p99, %r36, %r343;
+ add.s32 %r342, %r9, -1;
+ setp.eq.s32 %p99, %r35, %r342;
cvt.s64.s32 %rd109, %r9;
mov.u64 %rd110, -9223372036854775807;
sub.s64 %rd111, %rd110, %rd109;
selp.b64 %rd112, %rd111, 1, %p99;
atom.global.add.u64 %rd28, [%rd27], %rd112;
ld.volatile.global.u64 %rd113, [%rd27];
xor.b64 %rd114, %rd113, %rd28;
setp.lt.s64 %p100, %rd114, 0;
@%p100 bra $L__BB0_86;
- mov.u32 %r606, 8;
+ mov.u32 %r605, 8;
$L__BB0_85:
- nanosleep.u32 %r606;
-
- setp.lt.u32 %p101, %r606, 256;
- selp.u32 %r346, 1, 0, %p101;
- shl.b32 %r606, %r606, %r346;
+ nanosleep.u32 %r605;
+
+ setp.lt.u32 %p101, %r605, 256;
+ selp.u32 %r345, 1, 0, %p101;
+ shl.b32 %r605, %r605, %r345;
ld.volatile.global.u64 %rd115, [%rd27];
xor.b64 %rd116, %rd115, %rd28;
setp.gt.s64 %p102, %rd116, -1;
@%p102 bra $L__BB0_85;
$L__BB0_86:
bar.sync 0;
- add.s32 %r347, %r4, %r2;
- add.s32 %r348, %r347, -1;
- div.s32 %r349, %r348, %r4;
- add.s32 %r350, %r9, %r349;
- add.s32 %r351, %r350, -1;
- div.s32 %r39, %r351, %r9;
- setp.lt.s32 %p103, %r39, 1;
+ add.s32 %r346, %r4, %r2;
+ add.s32 %r347, %r346, -1;
+ div.s32 %r348, %r347, %r4;
+ add.s32 %r349, %r9, %r348;
+ add.s32 %r350, %r349, -1;
+ div.s32 %r38, %r350, %r9;
+ setp.lt.s32 %p103, %r38, 1;
@%p103 bra $L__BB0_133;
- add.s32 %r353, %r9, %r3;
- add.s32 %r354, %r353, -1;
- shl.b32 %r40, %r8, 1;
- shl.b32 %r355, %r4, 1;
- mad.lo.s32 %r43, %r355, %r36, %r40;
- or.b32 %r41, %r43, 1;
- mul.lo.s32 %r42, %r355, %r9;
- shr.u32 %r44, %r3, 5;
- mul.lo.s32 %r356, %r24, %r44;
- shr.u32 %r45, %r6, 5;
- add.s32 %r357, %r356, %r45;
- mul.wide.u32 %rd117, %r357, 8;
+ add.s32 %r352, %r9, %r3;
+ add.s32 %r353, %r352, -1;
+ shl.b32 %r39, %r8, 1;
+ shl.b32 %r354, %r4, 1;
+ mad.lo.s32 %r42, %r354, %r35, %r39;
+ or.b32 %r40, %r42, 1;
+ mul.lo.s32 %r41, %r354, %r9;
+ shr.u32 %r43, %r3, 5;
+ mul.lo.s32 %r355, %r23, %r43;
+ shr.u32 %r44, %r6, 5;
+ add.s32 %r356, %r355, %r44;
+ mul.wide.u32 %rd117, %r356, 8;
add.s64 %rd29, %rd44, %rd117;
- and.b32 %r46, %r6, 31;
- add.s32 %r358, %r356, %r46;
- mul.wide.u32 %rd119, %r358, 8;
+ and.b32 %r45, %r6, 31;
+ add.s32 %r357, %r355, %r45;
+ mul.wide.u32 %rd119, %r357, 8;
add.s64 %rd30, %rd44, %rd119;
- div.s32 %r47, %r354, %r3;
- mov.u32 %r607, 0;
+ div.s32 %r46, %r353, %r3;
+ mov.u32 %r606, 0;
$L__BB0_88:
.pragma "nounroll";
- setp.lt.s32 %p104, %r47, 1;
+ setp.lt.s32 %p104, %r46, 1;
mov.f64 %fd388, 0d0000000000000000;
mov.f64 %fd389, %fd388;
@%p104 bra $L__BB0_94;
- mul.lo.s32 %r360, %r42, %r607;
- add.s32 %r49, %r41, %r360;
- add.s32 %r50, %r43, %r360;
- mov.u32 %r608, 0;
+ mul.lo.s32 %r359, %r41, %r606;
+ add.s32 %r48, %r40, %r359;
+ add.s32 %r49, %r42, %r359;
+ mov.u32 %r607, 0;
mov.f64 %fd251, 0d0000000000000000;
mov.f64 %fd388, %fd251;
mov.f64 %fd389, %fd251;
$L__BB0_90:
.pragma "nounroll";
- setp.ge.s32 %p105, %r49, %r108;
+ setp.ge.s32 %p105, %r48, %r107;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p105 bra $L__BB0_93;
- mad.lo.s32 %r52, %r608, %r3, %r6;
- setp.ge.s32 %p106, %r52, %r9;
+ mad.lo.s32 %r51, %r607, %r3, %r6;
+ setp.ge.s32 %p106, %r51, %r9;
mov.f64 %fd386, %fd251;
mov.f64 %fd387, %fd251;
@%p106 bra $L__BB0_93;
- mad.lo.s32 %r365, %r52, %r108, %r50;
- mul.wide.s32 %rd121, %r365, 8;
+ mad.lo.s32 %r364, %r51, %r107, %r49;
+ mul.wide.s32 %rd121, %r364, 8;
add.s64 %rd120, %rd42, %rd121;
- ld.volatile.global.v4.s32 {%r361,%r362,%r363,%r364}, [%rd120];
-
- mov.b64 %rd122, {%r361, %r362};
+ ld.volatile.global.v4.s32 {%r360,%r361,%r362,%r363}, [%rd120];
+
+ mov.b64 %rd122, {%r360, %r361};
mov.b64 %fd387, %rd122;
- mov.b64 %rd123, {%r363, %r364};
+ mov.b64 %rd123, {%r362, %r363};
mov.b64 %fd386, %rd123;
$L__BB0_93:
add.f64 %fd389, %fd389, %fd387;
add.f64 %fd388, %fd388, %fd386;
- add.s32 %r608, %r608, 1;
- setp.lt.s32 %p107, %r608, %r47;
+ add.s32 %r607, %r607, 1;
+ setp.lt.s32 %p107, %r607, %r46;
@%p107 bra $L__BB0_90;
$L__BB0_94:
- mov.b64 {%r366,%r367}, %fd389;
-
- mov.u32 %r386, 31;
- mov.u32 %r387, 16;
- mov.u32 %r388, -1;
- shfl.sync.bfly.b32 %r369|%p108, %r367, %r387, %r386, %r388;
- shfl.sync.bfly.b32 %r368|%p109, %r366, %r387, %r386, %r388;
-
- mov.b64 %fd257, {%r368,%r369};
+ mov.b64 {%r365,%r366}, %fd389;
+
+ mov.u32 %r385, 31;
+ mov.u32 %r386, 16;
+ mov.u32 %r387, -1;
+ shfl.sync.bfly.b32 %r368|%p108, %r366, %r386, %r385, %r387;
+ shfl.sync.bfly.b32 %r367|%p109, %r365, %r386, %r385, %r387;
+
+ mov.b64 %fd257, {%r367,%r368};
add.f64 %fd258, %fd389, %fd257;
- mov.b64 {%r370,%r371}, %fd258;
-
- mov.u32 %r389, 8;
- shfl.sync.bfly.b32 %r373|%p110, %r371, %r389, %r386, %r388;
- shfl.sync.bfly.b32 %r372|%p111, %r370, %r389, %r386, %r388;
-
- mov.b64 %fd259, {%r372,%r373};
+ mov.b64 {%r369,%r370}, %fd258;
+
+ mov.u32 %r388, 8;
+ shfl.sync.bfly.b32 %r372|%p110, %r370, %r388, %r385, %r387;
+ shfl.sync.bfly.b32 %r371|%p111, %r369, %r388, %r385, %r387;
+
+ mov.b64 %fd259, {%r371,%r372};
add.f64 %fd260, %fd258, %fd259;
- mov.b64 {%r374,%r375}, %fd260;
-
- mov.u32 %r390, 4;
- shfl.sync.bfly.b32 %r377|%p112, %r375, %r390, %r386, %r388;
- shfl.sync.bfly.b32 %r376|%p113, %r374, %r390, %r386, %r388;
-
- mov.b64 %fd261, {%r376,%r377};
+ mov.b64 {%r373,%r374}, %fd260;
+
+ mov.u32 %r389, 4;
+ shfl.sync.bfly.b32 %r376|%p112, %r374, %r389, %r385, %r387;
+ shfl.sync.bfly.b32 %r375|%p113, %r373, %r389, %r385, %r387;
+
+ mov.b64 %fd261, {%r375,%r376};
add.f64 %fd262, %fd260, %fd261;
- mov.b64 {%r378,%r379}, %fd262;
-
- mov.u32 %r391, 2;
- shfl.sync.bfly.b32 %r381|%p114, %r379, %r391, %r386, %r388;
- shfl.sync.bfly.b32 %r380|%p115, %r378, %r391, %r386, %r388;
-
- mov.b64 %fd263, {%r380,%r381};
+ mov.b64 {%r377,%r378}, %fd262;
+
+ mov.u32 %r390, 2;
+ shfl.sync.bfly.b32 %r380|%p114, %r378, %r390, %r385, %r387;
+ shfl.sync.bfly.b32 %r379|%p115, %r377, %r390, %r385, %r387;
+
+ mov.b64 %fd263, {%r379,%r380};
add.f64 %fd264, %fd262, %fd263;
- mov.b64 {%r382,%r383}, %fd264;
-
- mov.u32 %r392, 1;
- shfl.sync.bfly.b32 %r385|%p116, %r383, %r392, %r386, %r388;
- shfl.sync.bfly.b32 %r384|%p117, %r382, %r392, %r386, %r388;
-
- mov.b64 %fd265, {%r384,%r385};
+ mov.b64 {%r381,%r382}, %fd264;
+
+ mov.u32 %r391, 1;
+ shfl.sync.bfly.b32 %r384|%p116, %r382, %r391, %r385, %r387;
+ shfl.sync.bfly.b32 %r383|%p117, %r381, %r391, %r385, %r387;
+
+ mov.b64 %fd265, {%r383,%r384};
add.f64 %fd391, %fd264, %fd265;
bar.sync 0;
- setp.ne.s32 %p118, %r46, 0;
+ setp.ne.s32 %p118, %r45, 0;
@%p118 bra $L__BB0_96;
st.shared.f64 [%rd29], %fd391;
$L__BB0_96:
- setp.ne.s32 %p119, %r45, 0;
+ setp.ne.s32 %p119, %r44, 0;
bar.sync 0;
@%p119 bra $L__BB0_100;
- setp.ge.u32 %p120, %r46, %r44;
+ setp.ge.u32 %p120, %r45, %r43;
mov.f64 %fd390, 0d0000000000000000;
@%p120 bra $L__BB0_99;
ld.shared.f64 %fd390, [%rd30];
$L__BB0_99:
- mov.b64 {%r393,%r394}, %fd390;
-
- mov.u32 %r413, 31;
- mov.u32 %r414, 16;
- mov.u32 %r415, -1;
- shfl.sync.bfly.b32 %r396|%p121, %r394, %r414, %r413, %r415;
- shfl.sync.bfly.b32 %r395|%p122, %r393, %r414, %r413, %r415;
-
- mov.b64 %fd268, {%r395,%r396};
+ mov.b64 {%r392,%r393}, %fd390;
+
+ mov.u32 %r412, 31;
+ mov.u32 %r413, 16;
+ mov.u32 %r414, -1;
+ shfl.sync.bfly.b32 %r395|%p121, %r393, %r413, %r412, %r414;
+ shfl.sync.bfly.b32 %r394|%p122, %r392, %r413, %r412, %r414;
+
+ mov.b64 %fd268, {%r394,%r395};
add.f64 %fd269, %fd390, %fd268;
- mov.b64 {%r397,%r398}, %fd269;
-
- mov.u32 %r416, 8;
- shfl.sync.bfly.b32 %r400|%p123, %r398, %r416, %r413, %r415;
- shfl.sync.bfly.b32 %r399|%p124, %r397, %r416, %r413, %r415;
-
- mov.b64 %fd270, {%r399,%r400};
+ mov.b64 {%r396,%r397}, %fd269;
+
+ mov.u32 %r415, 8;
+ shfl.sync.bfly.b32 %r399|%p123, %r397, %r415, %r412, %r414;
+ shfl.sync.bfly.b32 %r398|%p124, %r396, %r415, %r412, %r414;
+
+ mov.b64 %fd270, {%r398,%r399};
add.f64 %fd271, %fd269, %fd270;
- mov.b64 {%r401,%r402}, %fd271;
-
- mov.u32 %r417, 4;
- shfl.sync.bfly.b32 %r404|%p125, %r402, %r417, %r413, %r415;
- shfl.sync.bfly.b32 %r403|%p126, %r401, %r417, %r413, %r415;
-
- mov.b64 %fd272, {%r403,%r404};
+ mov.b64 {%r400,%r401}, %fd271;
+
+ mov.u32 %r416, 4;
+ shfl.sync.bfly.b32 %r403|%p125, %r401, %r416, %r412, %r414;
+ shfl.sync.bfly.b32 %r402|%p126, %r400, %r416, %r412, %r414;
+
+ mov.b64 %fd272, {%r402,%r403};
add.f64 %fd273, %fd271, %fd272;
- mov.b64 {%r405,%r406}, %fd273;
-
- mov.u32 %r418, 2;
- shfl.sync.bfly.b32 %r408|%p127, %r406, %r418, %r413, %r415;
- shfl.sync.bfly.b32 %r407|%p128, %r405, %r418, %r413, %r415;
-
- mov.b64 %fd274, {%r407,%r408};
+ mov.b64 {%r404,%r405}, %fd273;
+
+ mov.u32 %r417, 2;
+ shfl.sync.bfly.b32 %r407|%p127, %r405, %r417, %r412, %r414;
+ shfl.sync.bfly.b32 %r406|%p128, %r404, %r417, %r412, %r414;
+
+ mov.b64 %fd274, {%r406,%r407};
add.f64 %fd275, %fd273, %fd274;
- mov.b64 {%r409,%r410}, %fd275;
-
- mov.u32 %r419, 1;
- shfl.sync.bfly.b32 %r412|%p129, %r410, %r419, %r413, %r415;
- shfl.sync.bfly.b32 %r411|%p130, %r409, %r419, %r413, %r415;
-
- mov.b64 %fd276, {%r411,%r412};
+ mov.b64 {%r408,%r409}, %fd275;
+
+ mov.u32 %r418, 1;
+ shfl.sync.bfly.b32 %r411|%p129, %r409, %r418, %r412, %r414;
+ shfl.sync.bfly.b32 %r410|%p130, %r408, %r418, %r412, %r414;
+
+ mov.b64 %fd276, {%r410,%r411};
add.f64 %fd391, %fd275, %fd276;
$L__BB0_100:
add.f64 %fd287, %fd391, 0d0000000000000000;
- setp.eq.s32 %p132, %r46, 0;
+ setp.eq.s32 %p132, %r45, 0;
selp.f64 %fd71, %fd287, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r420,%r421}, %fd388;
-
- mov.u32 %r440, 31;
- mov.u32 %r441, 16;
- mov.u32 %r442, -1;
- shfl.sync.bfly.b32 %r423|%p133, %r421, %r441, %r440, %r442;
- shfl.sync.bfly.b32 %r422|%p134, %r420, %r441, %r440, %r442;
-
- mov.b64 %fd278, {%r422,%r423};
+ mov.b64 {%r419,%r420}, %fd388;
+
+ mov.u32 %r439, 31;
+ mov.u32 %r440, 16;
+ mov.u32 %r441, -1;
+ shfl.sync.bfly.b32 %r422|%p133, %r420, %r440, %r439, %r441;
+ shfl.sync.bfly.b32 %r421|%p134, %r419, %r440, %r439, %r441;
+
+ mov.b64 %fd278, {%r421,%r422};
add.f64 %fd279, %fd388, %fd278;
- mov.b64 {%r424,%r425}, %fd279;
-
- mov.u32 %r443, 8;
- shfl.sync.bfly.b32 %r427|%p135, %r425, %r443, %r440, %r442;
- shfl.sync.bfly.b32 %r426|%p136, %r424, %r443, %r440, %r442;
-
- mov.b64 %fd280, {%r426,%r427};
+ mov.b64 {%r423,%r424}, %fd279;
+
+ mov.u32 %r442, 8;
+ shfl.sync.bfly.b32 %r426|%p135, %r424, %r442, %r439, %r441;
+ shfl.sync.bfly.b32 %r425|%p136, %r423, %r442, %r439, %r441;
+
+ mov.b64 %fd280, {%r425,%r426};
add.f64 %fd281, %fd279, %fd280;
- mov.b64 {%r428,%r429}, %fd281;
-
- mov.u32 %r444, 4;
- shfl.sync.bfly.b32 %r431|%p137, %r429, %r444, %r440, %r442;
- shfl.sync.bfly.b32 %r430|%p138, %r428, %r444, %r440, %r442;
-
- mov.b64 %fd282, {%r430,%r431};
+ mov.b64 {%r427,%r428}, %fd281;
+
+ mov.u32 %r443, 4;
+ shfl.sync.bfly.b32 %r430|%p137, %r428, %r443, %r439, %r441;
+ shfl.sync.bfly.b32 %r429|%p138, %r427, %r443, %r439, %r441;
+
+ mov.b64 %fd282, {%r429,%r430};
add.f64 %fd283, %fd281, %fd282;
- mov.b64 {%r432,%r433}, %fd283;
-
- mov.u32 %r445, 2;
- shfl.sync.bfly.b32 %r435|%p139, %r433, %r445, %r440, %r442;
- shfl.sync.bfly.b32 %r434|%p140, %r432, %r445, %r440, %r442;
-
- mov.b64 %fd284, {%r434,%r435};
+ mov.b64 {%r431,%r432}, %fd283;
+
+ mov.u32 %r444, 2;
+ shfl.sync.bfly.b32 %r434|%p139, %r432, %r444, %r439, %r441;
+ shfl.sync.bfly.b32 %r433|%p140, %r431, %r444, %r439, %r441;
+
+ mov.b64 %fd284, {%r433,%r434};
add.f64 %fd285, %fd283, %fd284;
- mov.b64 {%r436,%r437}, %fd285;
-
- mov.u32 %r446, 1;
- shfl.sync.bfly.b32 %r439|%p141, %r437, %r446, %r440, %r442;
- shfl.sync.bfly.b32 %r438|%p142, %r436, %r446, %r440, %r442;
-
- mov.b64 %fd286, {%r438,%r439};
+ mov.b64 {%r435,%r436}, %fd285;
+
+ mov.u32 %r445, 1;
+ shfl.sync.bfly.b32 %r438|%p141, %r436, %r445, %r439, %r441;
+ shfl.sync.bfly.b32 %r437|%p142, %r435, %r445, %r439, %r441;
+
+ mov.b64 %fd286, {%r437,%r438};
add.f64 %fd393, %fd285, %fd286;
bar.sync 0;
@%p118 bra $L__BB0_102;
@@ -1173,201 +1172,201 @@
$L__BB0_102:
bar.sync 0;
@%p119 bra $L__BB0_106;
- setp.ge.u32 %p144, %r46, %r44;
+ setp.ge.u32 %p144, %r45, %r43;
mov.f64 %fd392, 0d0000000000000000;
@%p144 bra $L__BB0_105;
ld.shared.f64 %fd392, [%rd30];
$L__BB0_105:
- mov.b64 {%r447,%r448}, %fd392;
-
- mov.u32 %r467, 31;
- mov.u32 %r468, 16;
- mov.u32 %r469, -1;
- shfl.sync.bfly.b32 %r450|%p145, %r448, %r468, %r467, %r469;
- shfl.sync.bfly.b32 %r449|%p146, %r447, %r468, %r467, %r469;
-
- mov.b64 %fd290, {%r449,%r450};
+ mov.b64 {%r446,%r447}, %fd392;
+
+ mov.u32 %r466, 31;
+ mov.u32 %r467, 16;
+ mov.u32 %r468, -1;
+ shfl.sync.bfly.b32 %r449|%p145, %r447, %r467, %r466, %r468;
+ shfl.sync.bfly.b32 %r448|%p146, %r446, %r467, %r466, %r468;
+
+ mov.b64 %fd290, {%r448,%r449};
add.f64 %fd291, %fd392, %fd290;
- mov.b64 {%r451,%r452}, %fd291;
-
- mov.u32 %r470, 8;
- shfl.sync.bfly.b32 %r454|%p147, %r452, %r470, %r467, %r469;
- shfl.sync.bfly.b32 %r453|%p148, %r451, %r470, %r467, %r469;
-
- mov.b64 %fd292, {%r453,%r454};
+ mov.b64 {%r450,%r451}, %fd291;
+
+ mov.u32 %r469, 8;
+ shfl.sync.bfly.b32 %r453|%p147, %r451, %r469, %r466, %r468;
+ shfl.sync.bfly.b32 %r452|%p148, %r450, %r469, %r466, %r468;
+
+ mov.b64 %fd292, {%r452,%r453};
add.f64 %fd293, %fd291, %fd292;
- mov.b64 {%r455,%r456}, %fd293;
-
- mov.u32 %r471, 4;
- shfl.sync.bfly.b32 %r458|%p149, %r456, %r471, %r467, %r469;
- shfl.sync.bfly.b32 %r457|%p150, %r455, %r471, %r467, %r469;
-
- mov.b64 %fd294, {%r457,%r458};
+ mov.b64 {%r454,%r455}, %fd293;
+
+ mov.u32 %r470, 4;
+ shfl.sync.bfly.b32 %r457|%p149, %r455, %r470, %r466, %r468;
+ shfl.sync.bfly.b32 %r456|%p150, %r454, %r470, %r466, %r468;
+
+ mov.b64 %fd294, {%r456,%r457};
add.f64 %fd295, %fd293, %fd294;
- mov.b64 {%r459,%r460}, %fd295;
-
- mov.u32 %r472, 2;
- shfl.sync.bfly.b32 %r462|%p151, %r460, %r472, %r467, %r469;
- shfl.sync.bfly.b32 %r461|%p152, %r459, %r472, %r467, %r469;
-
- mov.b64 %fd296, {%r461,%r462};
+ mov.b64 {%r458,%r459}, %fd295;
+
+ mov.u32 %r471, 2;
+ shfl.sync.bfly.b32 %r461|%p151, %r459, %r471, %r466, %r468;
+ shfl.sync.bfly.b32 %r460|%p152, %r458, %r471, %r466, %r468;
+
+ mov.b64 %fd296, {%r460,%r461};
add.f64 %fd297, %fd295, %fd296;
- mov.b64 {%r463,%r464}, %fd297;
-
- mov.u32 %r473, 1;
- shfl.sync.bfly.b32 %r466|%p153, %r464, %r473, %r467, %r469;
- shfl.sync.bfly.b32 %r465|%p154, %r463, %r473, %r467, %r469;
-
- mov.b64 %fd298, {%r465,%r466};
+ mov.b64 {%r462,%r463}, %fd297;
+
+ mov.u32 %r472, 1;
+ shfl.sync.bfly.b32 %r465|%p153, %r463, %r472, %r466, %r468;
+ shfl.sync.bfly.b32 %r464|%p154, %r462, %r472, %r466, %r468;
+
+ mov.b64 %fd298, {%r464,%r465};
add.f64 %fd393, %fd297, %fd298;
$L__BB0_106:
bar.sync 0;
setp.ne.s32 %p155, %r6, 0;
@%p155 bra $L__BB0_109;
- mul.lo.s32 %r54, %r42, %r607;
- add.s32 %r474, %r41, %r54;
- setp.ge.s32 %p156, %r474, %r108;
+ mul.lo.s32 %r53, %r41, %r606;
+ add.s32 %r473, %r40, %r53;
+ setp.ge.s32 %p156, %r473, %r107;
@%p156 bra $L__BB0_109;
- add.s32 %r479, %r43, %r54;
- mul.wide.s32 %rd125, %r479, 8;
+ add.s32 %r478, %r42, %r53;
+ mul.wide.s32 %rd125, %r478, 8;
add.s64 %rd124, %rd40, %rd125;
mov.b64 %rd126, %fd71;
- mov.b64 {%r475, %r476}, %rd126;
+ mov.b64 {%r474, %r475}, %rd126;
add.f64 %fd299, %fd393, 0d0000000000000000;
selp.f64 %fd300, %fd299, 0d0000000000000000, %p132;
mov.b64 %rd127, %fd300;
- mov.b64 {%r477, %r478}, %rd127;
-
- st.global.cs.v4.s32 [%rd124], {%r475,%r476,%r477,%r478};
+ mov.b64 {%r476, %r477}, %rd127;
+
+ st.global.cs.v4.s32 [%rd124], {%r474,%r475,%r476,%r477};
$L__BB0_109:
- add.s32 %r607, %r607, 1;
- setp.lt.s32 %p158, %r607, %r39;
+ add.s32 %r606, %r606, 1;
+ setp.lt.s32 %p158, %r606, %r38;
@%p158 bra $L__BB0_88;
- mad.lo.s32 %r56, %r108, %r6, %r40;
- shl.b32 %r57, %r36, 1;
- shl.b32 %r58, %r9, 1;
- mul.lo.s32 %r59, %r108, %r3;
- mov.u32 %r609, 0;
+ mad.lo.s32 %r55, %r107, %r6, %r39;
+ shl.b32 %r56, %r35, 1;
+ shl.b32 %r57, %r9, 1;
+ mul.lo.s32 %r58, %r107, %r3;
+ mov.u32 %r608, 0;
$L__BB0_111:
.pragma "nounroll";
mov.f64 %fd398, 0d0000000000000000;
mov.f64 %fd399, %fd398;
@%p104 bra $L__BB0_117;
- mad.lo.s32 %r61, %r42, %r609, %r41;
- mad.lo.s32 %r482, %r58, %r609, %r57;
- mad.lo.s32 %r611, %r4, %r482, %r56;
- mov.u32 %r612, 0;
+ mad.lo.s32 %r60, %r41, %r608, %r40;
+ mad.lo.s32 %r481, %r57, %r608, %r56;
+ mad.lo.s32 %r610, %r4, %r481, %r55;
+ mov.u32 %r611, 0;
mov.f64 %fd304, 0d0000000000000000;
- mov.u32 %r610, %r6;
+ mov.u32 %r609, %r6;
mov.f64 %fd398, %fd304;
mov.f64 %fd399, %fd304;
$L__BB0_113:
.pragma "nounroll";
- setp.ge.s32 %p160, %r61, %r108;
+ setp.ge.s32 %p160, %r60, %r107;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p160 bra $L__BB0_116;
- setp.ge.s32 %p161, %r610, %r9;
+ setp.ge.s32 %p161, %r609, %r9;
mov.f64 %fd396, %fd304;
mov.f64 %fd397, %fd304;
@%p161 bra $L__BB0_116;
- mul.wide.s32 %rd129, %r611, 8;
+ mul.wide.s32 %rd129, %r610, 8;
add.s64 %rd128, %rd41, %rd129;
- ld.volatile.global.v4.s32 {%r483,%r484,%r485,%r486}, [%rd128];
-
- mov.b64 %rd130, {%r483, %r484};
+ ld.volatile.global.v4.s32 {%r482,%r483,%r484,%r485}, [%rd128];
+
+ mov.b64 %rd130, {%r482, %r483};
mov.b64 %fd397, %rd130;
- mov.b64 %rd131, {%r485, %r486};
+ mov.b64 %rd131, {%r484, %r485};
mov.b64 %fd396, %rd131;
$L__BB0_116:
add.f64 %fd399, %fd399, %fd397;
add.f64 %fd398, %fd398, %fd396;
- add.s32 %r611, %r611, %r59;
- add.s32 %r610, %r610, %r3;
- add.s32 %r612, %r612, 1;
- setp.lt.s32 %p162, %r612, %r47;
+ add.s32 %r610, %r610, %r58;
+ add.s32 %r609, %r609, %r3;
+ add.s32 %r611, %r611, 1;
+ setp.lt.s32 %p162, %r611, %r46;
@%p162 bra $L__BB0_113;
$L__BB0_117:
- mov.b64 {%r487,%r488}, %fd399;
-
- mov.u32 %r507, 31;
- mov.u32 %r508, 16;
- mov.u32 %r509, -1;
- shfl.sync.bfly.b32 %r490|%p163, %r488, %r508, %r507, %r509;
- shfl.sync.bfly.b32 %r489|%p164, %r487, %r508, %r507, %r509;
-
- mov.b64 %fd310, {%r489,%r490};
+ mov.b64 {%r486,%r487}, %fd399;
+
+ mov.u32 %r506, 31;
+ mov.u32 %r507, 16;
+ mov.u32 %r508, -1;
+ shfl.sync.bfly.b32 %r489|%p163, %r487, %r507, %r506, %r508;
+ shfl.sync.bfly.b32 %r488|%p164, %r486, %r507, %r506, %r508;
+
+ mov.b64 %fd310, {%r488,%r489};
add.f64 %fd311, %fd399, %fd310;
- mov.b64 {%r491,%r492}, %fd311;
-
- mov.u32 %r510, 8;
- shfl.sync.bfly.b32 %r494|%p165, %r492, %r510, %r507, %r509;
- shfl.sync.bfly.b32 %r493|%p166, %r491, %r510, %r507, %r509;
-
- mov.b64 %fd312, {%r493,%r494};
+ mov.b64 {%r490,%r491}, %fd311;
+
+ mov.u32 %r509, 8;
+ shfl.sync.bfly.b32 %r493|%p165, %r491, %r509, %r506, %r508;
+ shfl.sync.bfly.b32 %r492|%p166, %r490, %r509, %r506, %r508;
+
+ mov.b64 %fd312, {%r492,%r493};
add.f64 %fd313, %fd311, %fd312;
- mov.b64 {%r495,%r496}, %fd313;
-
- mov.u32 %r511, 4;
- shfl.sync.bfly.b32 %r498|%p167, %r496, %r511, %r507, %r509;
- shfl.sync.bfly.b32 %r497|%p168, %r495, %r511, %r507, %r509;
-
- mov.b64 %fd314, {%r497,%r498};
+ mov.b64 {%r494,%r495}, %fd313;
+
+ mov.u32 %r510, 4;
+ shfl.sync.bfly.b32 %r497|%p167, %r495, %r510, %r506, %r508;
+ shfl.sync.bfly.b32 %r496|%p168, %r494, %r510, %r506, %r508;
+
+ mov.b64 %fd314, {%r496,%r497};
add.f64 %fd315, %fd313, %fd314;
- mov.b64 {%r499,%r500}, %fd315;
-
- mov.u32 %r512, 2;
- shfl.sync.bfly.b32 %r502|%p169, %r500, %r512, %r507, %r509;
- shfl.sync.bfly.b32 %r501|%p170, %r499, %r512, %r507, %r509;
-
- mov.b64 %fd316, {%r501,%r502};
+ mov.b64 {%r498,%r499}, %fd315;
+
+ mov.u32 %r511, 2;
+ shfl.sync.bfly.b32 %r501|%p169, %r499, %r511, %r506, %r508;
+ shfl.sync.bfly.b32 %r500|%p170, %r498, %r511, %r506, %r508;
+
+ mov.b64 %fd316, {%r500,%r501};
add.f64 %fd317, %fd315, %fd316;
- mov.b64 {%r503,%r504}, %fd317;
-
- mov.u32 %r513, 1;
- shfl.sync.bfly.b32 %r506|%p171, %r504, %r513, %r507, %r509;
- shfl.sync.bfly.b32 %r505|%p172, %r503, %r513, %r507, %r509;
-
- mov.b64 %fd318, {%r505,%r506};
+ mov.b64 {%r502,%r503}, %fd317;
+
+ mov.u32 %r512, 1;
+ shfl.sync.bfly.b32 %r505|%p171, %r503, %r512, %r506, %r508;
+ shfl.sync.bfly.b32 %r504|%p172, %r502, %r512, %r506, %r508;
+
+ mov.b64 %fd318, {%r504,%r505};
add.f64 %fd401, %fd317, %fd318;
bar.sync 0;
@%p118 bra $L__BB0_119;
@@ -1375,124 +1374,124 @@
$L__BB0_119:
bar.sync 0;
@%p119 bra $L__BB0_123;
- setp.ge.u32 %p175, %r46, %r44;
+ setp.ge.u32 %p175, %r45, %r43;
mov.f64 %fd400, 0d0000000000000000;
@%p175 bra $L__BB0_122;
ld.shared.f64 %fd400, [%rd30];
$L__BB0_122:
- mov.b64 {%r514,%r515}, %fd400;
-
- mov.u32 %r534, 31;
- mov.u32 %r535, 16;
- mov.u32 %r536, -1;
- shfl.sync.bfly.b32 %r517|%p176, %r515, %r535, %r534, %r536;
- shfl.sync.bfly.b32 %r516|%p177, %r514, %r535, %r534, %r536;
-
- mov.b64 %fd321, {%r516,%r517};
+ mov.b64 {%r513,%r514}, %fd400;
+
+ mov.u32 %r533, 31;
+ mov.u32 %r534, 16;
+ mov.u32 %r535, -1;
+ shfl.sync.bfly.b32 %r516|%p176, %r514, %r534, %r533, %r535;
+ shfl.sync.bfly.b32 %r515|%p177, %r513, %r534, %r533, %r535;
+
+ mov.b64 %fd321, {%r515,%r516};
add.f64 %fd322, %fd400, %fd321;
- mov.b64 {%r518,%r519}, %fd322;
-
- mov.u32 %r537, 8;
- shfl.sync.bfly.b32 %r521|%p178, %r519, %r537, %r534, %r536;
- shfl.sync.bfly.b32 %r520|%p179, %r518, %r537, %r534, %r536;
-
- mov.b64 %fd323, {%r520,%r521};
+ mov.b64 {%r517,%r518}, %fd322;
+
+ mov.u32 %r536, 8;
+ shfl.sync.bfly.b32 %r520|%p178, %r518, %r536, %r533, %r535;
+ shfl.sync.bfly.b32 %r519|%p179, %r517, %r536, %r533, %r535;
+
+ mov.b64 %fd323, {%r519,%r520};
add.f64 %fd324, %fd322, %fd323;
- mov.b64 {%r522,%r523}, %fd324;
-
- mov.u32 %r538, 4;
- shfl.sync.bfly.b32 %r525|%p180, %r523, %r538, %r534, %r536;
- shfl.sync.bfly.b32 %r524|%p181, %r522, %r538, %r534, %r536;
-
- mov.b64 %fd325, {%r524,%r525};
+ mov.b64 {%r521,%r522}, %fd324;
+
+ mov.u32 %r537, 4;
+ shfl.sync.bfly.b32 %r524|%p180, %r522, %r537, %r533, %r535;
+ shfl.sync.bfly.b32 %r523|%p181, %r521, %r537, %r533, %r535;
+
+ mov.b64 %fd325, {%r523,%r524};
add.f64 %fd326, %fd324, %fd325;
- mov.b64 {%r526,%r527}, %fd326;
-
- mov.u32 %r539, 2;
- shfl.sync.bfly.b32 %r529|%p182, %r527, %r539, %r534, %r536;
- shfl.sync.bfly.b32 %r528|%p183, %r526, %r539, %r534, %r536;
-
- mov.b64 %fd327, {%r528,%r529};
+ mov.b64 {%r525,%r526}, %fd326;
+
+ mov.u32 %r538, 2;
+ shfl.sync.bfly.b32 %r528|%p182, %r526, %r538, %r533, %r535;
+ shfl.sync.bfly.b32 %r527|%p183, %r525, %r538, %r533, %r535;
+
+ mov.b64 %fd327, {%r527,%r528};
add.f64 %fd328, %fd326, %fd327;
- mov.b64 {%r530,%r531}, %fd328;
-
- mov.u32 %r540, 1;
- shfl.sync.bfly.b32 %r533|%p184, %r531, %r540, %r534, %r536;
- shfl.sync.bfly.b32 %r532|%p185, %r530, %r540, %r534, %r536;
-
- mov.b64 %fd329, {%r532,%r533};
+ mov.b64 {%r529,%r530}, %fd328;
+
+ mov.u32 %r539, 1;
+ shfl.sync.bfly.b32 %r532|%p184, %r530, %r539, %r533, %r535;
+ shfl.sync.bfly.b32 %r531|%p185, %r529, %r539, %r533, %r535;
+
+ mov.b64 %fd329, {%r531,%r532};
add.f64 %fd401, %fd328, %fd329;
$L__BB0_123:
add.f64 %fd340, %fd401, 0d0000000000000000;
selp.f64 %fd92, %fd340, 0d0000000000000000, %p132;
bar.sync 0;
- mov.b64 {%r541,%r542}, %fd398;
-
- mov.u32 %r561, 31;
- mov.u32 %r562, 16;
- mov.u32 %r563, -1;
- shfl.sync.bfly.b32 %r544|%p188, %r542, %r562, %r561, %r563;
- shfl.sync.bfly.b32 %r543|%p189, %r541, %r562, %r561, %r563;
-
- mov.b64 %fd331, {%r543,%r544};
+ mov.b64 {%r540,%r541}, %fd398;
+
+ mov.u32 %r560, 31;
+ mov.u32 %r561, 16;
+ mov.u32 %r562, -1;
+ shfl.sync.bfly.b32 %r543|%p188, %r541, %r561, %r560, %r562;
+ shfl.sync.bfly.b32 %r542|%p189, %r540, %r561, %r560, %r562;
+
+ mov.b64 %fd331, {%r542,%r543};
add.f64 %fd332, %fd398, %fd331;
- mov.b64 {%r545,%r546}, %fd332;
-
- mov.u32 %r564, 8;
- shfl.sync.bfly.b32 %r548|%p190, %r546, %r564, %r561, %r563;
- shfl.sync.bfly.b32 %r547|%p191, %r545, %r564, %r561, %r563;
-
- mov.b64 %fd333, {%r547,%r548};
+ mov.b64 {%r544,%r545}, %fd332;
+
+ mov.u32 %r563, 8;
+ shfl.sync.bfly.b32 %r547|%p190, %r545, %r563, %r560, %r562;
+ shfl.sync.bfly.b32 %r546|%p191, %r544, %r563, %r560, %r562;
+
+ mov.b64 %fd333, {%r546,%r547};
add.f64 %fd334, %fd332, %fd333;
- mov.b64 {%r549,%r550}, %fd334;
-
- mov.u32 %r565, 4;
- shfl.sync.bfly.b32 %r552|%p192, %r550, %r565, %r561, %r563;
- shfl.sync.bfly.b32 %r551|%p193, %r549, %r565, %r561, %r563;
-
- mov.b64 %fd335, {%r551,%r552};
+ mov.b64 {%r548,%r549}, %fd334;
+
+ mov.u32 %r564, 4;
+ shfl.sync.bfly.b32 %r551|%p192, %r549, %r564, %r560, %r562;
+ shfl.sync.bfly.b32 %r550|%p193, %r548, %r564, %r560, %r562;
+
+ mov.b64 %fd335, {%r550,%r551};
add.f64 %fd336, %fd334, %fd335;
- mov.b64 {%r553,%r554}, %fd336;
-
- mov.u32 %r566, 2;
- shfl.sync.bfly.b32 %r556|%p194, %r554, %r566, %r561, %r563;
- shfl.sync.bfly.b32 %r555|%p195, %r553, %r566, %r561, %r563;
-
- mov.b64 %fd337, {%r555,%r556};
+ mov.b64 {%r552,%r553}, %fd336;
+
+ mov.u32 %r565, 2;
+ shfl.sync.bfly.b32 %r555|%p194, %r553, %r565, %r560, %r562;
+ shfl.sync.bfly.b32 %r554|%p195, %r552, %r565, %r560, %r562;
+
+ mov.b64 %fd337, {%r554,%r555};
add.f64 %fd338, %fd336, %fd337;
- mov.b64 {%r557,%r558}, %fd338;
-
- mov.u32 %r567, 1;
- shfl.sync.bfly.b32 %r560|%p196, %r558, %r567, %r561, %r563;
- shfl.sync.bfly.b32 %r559|%p197, %r557, %r567, %r561, %r563;
-
- mov.b64 %fd339, {%r559,%r560};
+ mov.b64 {%r556,%r557}, %fd338;
+
+ mov.u32 %r566, 1;
+ shfl.sync.bfly.b32 %r559|%p196, %r557, %r566, %r560, %r562;
+ shfl.sync.bfly.b32 %r558|%p197, %r556, %r566, %r560, %r562;
+
+ mov.b64 %fd339, {%r558,%r559};
add.f64 %fd403, %fd338, %fd339;
bar.sync 0;
@%p118 bra $L__BB0_125;
@@ -1500,95 +1499,95 @@
$L__BB0_125:
bar.sync 0;
@%p119 bra $L__BB0_129;
- setp.ge.u32 %p199, %r46, %r44;
+ setp.ge.u32 %p199, %r45, %r43;
mov.f64 %fd402, 0d0000000000000000;
@%p199 bra $L__BB0_128;
ld.shared.f64 %fd402, [%rd30];
$L__BB0_128:
- mov.b64 {%r568,%r569}, %fd402;
-
- mov.u32 %r588, 31;
- mov.u32 %r589, 16;
- mov.u32 %r590, -1;
- shfl.sync.bfly.b32 %r571|%p200, %r569, %r589, %r588, %r590;
- shfl.sync.bfly.b32 %r570|%p201, %r568, %r589, %r588, %r590;
-
- mov.b64 %fd343, {%r570,%r571};
+ mov.b64 {%r567,%r568}, %fd402;
+
+ mov.u32 %r587, 31;
+ mov.u32 %r588, 16;
+ mov.u32 %r589, -1;
+ shfl.sync.bfly.b32 %r570|%p200, %r568, %r588, %r587, %r589;
+ shfl.sync.bfly.b32 %r569|%p201, %r567, %r588, %r587, %r589;
+
+ mov.b64 %fd343, {%r569,%r570};
add.f64 %fd344, %fd402, %fd343;
- mov.b64 {%r572,%r573}, %fd344;
-
- mov.u32 %r591, 8;
- shfl.sync.bfly.b32 %r575|%p202, %r573, %r591, %r588, %r590;
- shfl.sync.bfly.b32 %r574|%p203, %r572, %r591, %r588, %r590;
-
- mov.b64 %fd345, {%r574,%r575};
+ mov.b64 {%r571,%r572}, %fd344;
+
+ mov.u32 %r590, 8;
+ shfl.sync.bfly.b32 %r574|%p202, %r572, %r590, %r587, %r589;
+ shfl.sync.bfly.b32 %r573|%p203, %r571, %r590, %r587, %r589;
+
+ mov.b64 %fd345, {%r573,%r574};
add.f64 %fd346, %fd344, %fd345;
- mov.b64 {%r576,%r577}, %fd346;
-
- mov.u32 %r592, 4;
- shfl.sync.bfly.b32 %r579|%p204, %r577, %r592, %r588, %r590;
- shfl.sync.bfly.b32 %r578|%p205, %r576, %r592, %r588, %r590;
-
- mov.b64 %fd347, {%r578,%r579};
+ mov.b64 {%r575,%r576}, %fd346;
+
+ mov.u32 %r591, 4;
+ shfl.sync.bfly.b32 %r578|%p204, %r576, %r591, %r587, %r589;
+ shfl.sync.bfly.b32 %r577|%p205, %r575, %r591, %r587, %r589;
+
+ mov.b64 %fd347, {%r577,%r578};
add.f64 %fd348, %fd346, %fd347;
- mov.b64 {%r580,%r581}, %fd348;
-
- mov.u32 %r593, 2;
- shfl.sync.bfly.b32 %r583|%p206, %r581, %r593, %r588, %r590;
- shfl.sync.bfly.b32 %r582|%p207, %r580, %r593, %r588, %r590;
-
- mov.b64 %fd349, {%r582,%r583};
+ mov.b64 {%r579,%r580}, %fd348;
+
+ mov.u32 %r592, 2;
+ shfl.sync.bfly.b32 %r582|%p206, %r580, %r592, %r587, %r589;
+ shfl.sync.bfly.b32 %r581|%p207, %r579, %r592, %r587, %r589;
+
+ mov.b64 %fd349, {%r581,%r582};
add.f64 %fd350, %fd348, %fd349;
- mov.b64 {%r584,%r585}, %fd350;
-
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r587|%p208, %r585, %r594, %r588, %r590;
- shfl.sync.bfly.b32 %r586|%p209, %r584, %r594, %r588, %r590;
-
- mov.b64 %fd351, {%r586,%r587};
+ mov.b64 {%r583,%r584}, %fd350;
+
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r586|%p208, %r584, %r593, %r587, %r589;
+ shfl.sync.bfly.b32 %r585|%p209, %r583, %r593, %r587, %r589;
+
+ mov.b64 %fd351, {%r585,%r586};
add.f64 %fd403, %fd350, %fd351;
$L__BB0_129:
bar.sync 0;
@%p155 bra $L__BB0_132;
- mul.lo.s32 %r69, %r42, %r609;
- add.s32 %r595, %r41, %r69;
- setp.ge.s32 %p211, %r595, %r108;
+ mul.lo.s32 %r68, %r41, %r608;
+ add.s32 %r594, %r40, %r68;
+ setp.ge.s32 %p211, %r594, %r107;
@%p211 bra $L__BB0_132;
- add.s32 %r600, %r43, %r69;
- mul.wide.s32 %rd133, %r600, 8;
+ add.s32 %r599, %r42, %r68;
+ mul.wide.s32 %rd133, %r599, 8;
add.s64 %rd132, %rd39, %rd133;
mov.b64 %rd134, %fd92;
- mov.b64 {%r596, %r597}, %rd134;
+ mov.b64 {%r595, %r596}, %rd134;
add.f64 %fd352, %fd403, 0d0000000000000000;
selp.f64 %fd353, %fd352, 0d0000000000000000, %p132;
mov.b64 %rd135, %fd353;
- mov.b64 {%r598, %r599}, %rd135;
-
- st.global.cs.v4.s32 [%rd132], {%r596,%r597,%r598,%r599};
+ mov.b64 {%r597, %r598}, %rd135;
+
+ st.global.cs.v4.s32 [%rd132], {%r595,%r596,%r597,%r598};
$L__BB0_132:
- add.s32 %r609, %r609, 1;
- setp.lt.s32 %p213, %r609, %r39;
+ add.s32 %r608, %r608, 1;
+ setp.lt.s32 %p213, %r608, %r38;
@%p213 bra $L__BB0_111;
$L__BB0_133:
ret;
6: CombinedSchedulerTest.LayerNormBackward/dtype_double_batch_216_hidden_65536
Kernel 3
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-9
+9 index type: int
registers: 60
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T11, Tensor<double, 2, 2> T10, Tensor<double, 2, 2> T7, Tensor<double, 1, 1> T14, Tensor<double, 2, 2> T20) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
double* T28 = reinterpret_cast<double*>(array + smem_offset + 9216);
double* T25 = reinterpret_cast<double*>(array + smem_offset + 8192);
double* T24 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T7.data;
s0.logical_size = T7.logical_size;
s0.alloc_stride = T7.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((24 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 16) * 2) + 1) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<double, 8, 2> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T27.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(2 * i6)], &T7[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<double, 8, 2> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T26.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(2 * i7)], &T10[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T24[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
T12[0]
= T25[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
T15[0]
= T28[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<double, 1, 1> T19;
T19[0] = 0;
T19[0]
= d5
* T24[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i11 + nvfuser_zero)))], &T29[(2 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<double, 8, 2> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T27.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(2 * i6)], &T7[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<double, 8, 2> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T26.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(2 * i7)], &T10[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T24[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T12[0]
= T25[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
}
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T15[0]
= T28[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
}
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<double, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T19[0]
= d5
* T24[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
}
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i11 + nvfuser_zero)))], &T29[(2 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
__global__ void nvfuser_N(Tensor<double, 2, 2> T3, Tensor<double, 1, 1> T11, Tensor<double, 2, 2> T10, Tensor<double, 2, 2> T7, Tensor<double, 1, 1> T14, Tensor<double, 2, 2> T20) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
double* T28 = reinterpret_cast<double*>(array + smem_offset + 9216);
double* T25 = reinterpret_cast<double*>(array + smem_offset + 8192);
double* T24 = reinterpret_cast<double*>(array + smem_offset + 0);
Tensor<double, 2, 2> s0;
s0.data = T7.data;
s0.logical_size = T7.logical_size;
s0.alloc_stride = T7.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((24 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<double, 8, 2> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T27.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(2 * i6)], &T7[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<double, 8, 2> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T26.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(2 * i7)], &T10[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
T12[0]
= T25[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
T15[0]
= T28[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<double, 1, 1> T19;
T19[0] = 0;
T19[0]
= d5
* T24[((((64 * (((nvfuser_index_t)threadIdx.x) % 16)) + (((nvfuser_index_t)threadIdx.x) / 16)) + (8 * i9)) + (32 * i10))];
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i11 + nvfuser_zero)))], &T29[(2 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<double, 8, 2> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T27.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(2 * i6)], &T7[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<double, 8, 2> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T26.set(double(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(2 * i7)], &T10[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T12[0]
= T25[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
}
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T15[0]
= T28[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
}
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
Array<double, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<double, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T19[0]
= d5
* T24[((((64 * (((nvfuser_index_t)threadIdx.x) % 16)) + (((nvfuser_index_t)threadIdx.x) / 16)) + (8 * i9)) + (32 * i10))];
}
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
if (((((1 + (2 * (((nvfuser_index_t)threadIdx.x) % 16))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<double, /*vec_size=*/2, /*is_volatile=*/false>( &T20[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i11 + nvfuser_zero)))], &T29[(2 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -17,11 +17,11 @@
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
- if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((24 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 16) * 2) + 1) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
+ if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((24 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<double, 8, 2> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
@@ -46,11 +46,11 @@
loadGlobalToLocal<double, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(2 * i7)], &T10[(((((2 * (((nvfuser_index_t)threadIdx.x) % 16)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 16))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((8 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
- T24[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
@@ -60,19 +60,19 @@
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
T12[0]
- = T25[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ = T25[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
T15[0]
- = T28[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ = T28[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
Array<double, 1, 1> T18;
@@ -81,11 +81,11 @@
- T16[0];
Array<double, 1, 1> T19;
T19[0] = 0;
T19[0]
= d5
- * T24[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ * T24[((((64 * (((nvfuser_index_t)threadIdx.x) % 16)) + (((nvfuser_index_t)threadIdx.x) / 16)) + (8 * i9)) + (32 * i10))];
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
}
@@ -131,11 +131,11 @@
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
- T24[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
@@ -147,21 +147,21 @@
for(nvfuser_index_t i10 = 0; i10 < 2; ++i10) {
Array<double, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T12[0]
- = T25[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ = T25[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
}
Array<double, 1, 1> T17;
T17[0]
= T26[((2 * i9) + i10)]
- T12[0];
Array<double, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T15[0]
- = T28[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ = T28[(((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9)) % 128)];
}
Array<double, 1, 1> T16;
T16[0]
= T27[((2 * i9) + i10)]
* T15[0];
@@ -172,11 +172,11 @@
Array<double, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 16)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(8 * (i9 + nvfuser_zero))))) {
T19[0]
= d5
- * T24[((((nvfuser_index_t)threadIdx.x) / 16) + (8 * i9))];
+ * T24[((((64 * (((nvfuser_index_t)threadIdx.x) % 16)) + (((nvfuser_index_t)threadIdx.x) / 16)) + (8 * i9)) + (32 * i10))];
}
T29[((2 * i9) + i10)]
= T19[0]
* T18[0];
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<60>;
.reg .b32 %r<302>;
.reg .f64 %fd<305>;
.reg .b64 %rd<125>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r52, %r53}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0+16];
ld.param.v2.u32 {%r54, %r55}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3+8];
ld.param.u64 %rd13, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5];
ld.param.u64 %rd12, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4];
ld.param.u64 %rd11, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_2];
ld.param.u64 %rd14, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1];
cvta.to.global.u64 %rd1, %rd11;
cvta.to.global.u64 %rd2, %rd14;
ld.param.u64 %rd4, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r62, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r62;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd15, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r63, [%rd15], %r2;
ld.shared.u32 %r4, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_1033910nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
cvt.rn.f64.s32 %fd106, %r55;
rcp.rn.f64 %fd1, %fd106;
mul.wide.s32 %rd16, %r2, 8;
mov.u64 %rd17, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_8d9f5466_103395arrayE;
add.s64 %rd6, %rd17, %rd16;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r64, %r55, 31;
shr.s32 %r65, %r64, 31;
shr.u32 %r66, %r65, 27;
add.s32 %r67, %r64, %r66;
shr.s32 %r6, %r67, 5;
div.s32 %r68, %r5, %r6;
shl.b32 %r7, %r68, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd18, %rd12;
mul.wide.s32 %rd19, %r8, 8;
add.s64 %rd8, %rd18, %rd19;
@%p3 bra $L__BB0_7;
shr.s32 %r69, %r2, 31;
shr.u32 %r70, %r69, 28;
add.s32 %r71, %r2, %r70;
shr.s32 %r9, %r71, 4;
add.s32 %r72, %r9, %r7;
add.s32 %r73, %r72, 24;
setp.gt.s32 %p4, %r73, 215;
@%p4 bra $L__BB0_7;
and.b32 %r77, %r71, 2147483632;
sub.s32 %r78, %r2, %r77;
shl.b32 %r10, %r78, 1;
rem.s32 %r79, %r5, %r6;
shl.b32 %r11, %r79, 5;
add.s32 %r80, %r10, %r11;
or.b32 %r81, %r80, 1;
setp.ge.s32 %p5, %r81, %r55;
@%p5 bra $L__BB0_7;
shr.u32 %r83, %r69, 27;
add.s32 %r84, %r2, %r83;
and.b32 %r85, %r84, -32;
sub.s32 %r12, %r2, %r85;
add.s32 %r13, %r7, %r12;
setp.lt.s32 %p6, %r13, 216;
@%p6 bra $L__BB0_98;
bra.uni $L__BB0_7;
$L__BB0_98:
ld.global.f64 %fd182, [%rd8];
st.shared.f64 [%rd6+9216], %fd182;
shl.b32 %r288, %r4, 4;
add.s32 %r289, %r7, %r9;
add.s32 %r290, %r289, %r288;
mad.lo.s32 %r291, %r290, %r55, %r10;
add.s32 %r292, %r291, %r11;
mul.wide.s32 %rd88, %r292, 8;
add.s64 %rd76, %rd4, %rd88;
// begin inline asm
ld.global.cs.v4.u32 {%r240,%r241,%r242,%r243}, [%rd76];
// end inline asm
mov.b64 %rd89, {%r240, %r241};
mov.b64 %fd183, %rd89;
mov.b64 %rd90, {%r242, %r243};
mov.b64 %fd184, %rd90;
shl.b32 %r293, %r55, 3;
mul.wide.s32 %rd91, %r293, 8;
add.s64 %rd77, %rd76, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r244,%r245,%r246,%r247}, [%rd77];
// end inline asm
mov.b64 %rd92, {%r244, %r245};
mov.b64 %fd185, %rd92;
mov.b64 %rd93, {%r246, %r247};
mov.b64 %fd186, %rd93;
add.s64 %rd78, %rd77, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r248,%r249,%r250,%r251}, [%rd78];
// end inline asm
mov.b64 %rd94, {%r248, %r249};
mov.b64 %fd187, %rd94;
mov.b64 %rd95, {%r250, %r251};
mov.b64 %fd188, %rd95;
add.s64 %rd79, %rd78, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r252,%r253,%r254,%r255}, [%rd79];
// end inline asm
mov.b64 %rd96, {%r252, %r253};
mov.b64 %fd189, %rd96;
mov.b64 %rd97, {%r254, %r255};
mov.b64 %fd190, %rd97;
shl.b64 %rd98, %rd7, 3;
add.s64 %rd99, %rd2, %rd98;
ld.global.f64 %fd191, [%rd99];
st.shared.f64 [%rd6+8192], %fd191;
mad.lo.s32 %r294, %r4, 48, %r290;
mad.lo.s32 %r295, %r294, %r55, %r10;
add.s32 %r296, %r295, %r11;
mul.wide.s32 %rd100, %r296, 8;
add.s64 %rd80, %rd3, %rd100;
// begin inline asm
ld.global.cs.v4.u32 {%r256,%r257,%r258,%r259}, [%rd80];
// end inline asm
mov.b64 %rd101, {%r256, %r257};
mov.b64 %fd192, %rd101;
mov.b64 %rd102, {%r258, %r259};
mov.b64 %fd193, %rd102;
add.s64 %rd81, %rd80, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r260,%r261,%r262,%r263}, [%rd81];
// end inline asm
mov.b64 %rd103, {%r260, %r261};
mov.b64 %fd194, %rd103;
mov.b64 %rd104, {%r262, %r263};
mov.b64 %fd195, %rd104;
add.s64 %rd82, %rd81, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r264,%r265,%r266,%r267}, [%rd82];
// end inline asm
mov.b64 %rd105, {%r264, %r265};
mov.b64 %fd196, %rd105;
mov.b64 %rd106, {%r266, %r267};
mov.b64 %fd197, %rd106;
add.s64 %rd83, %rd82, %rd91;
// begin inline asm
ld.global.cs.v4.u32 {%r268,%r269,%r270,%r271}, [%rd83];
// end inline asm
mov.b64 %rd107, {%r268, %r269};
mov.b64 %fd198, %rd107;
mov.b64 %rd108, {%r270, %r271};
mov.b64 %fd199, %rd108;
mul.lo.s32 %r297, %r13, %r52;
mul.wide.s32 %rd109, %r297, 8;
add.s64 %rd110, %rd1, %rd109;
mul.wide.s32 %rd111, %r12, 8;
add.s64 %rd113, %rd17, %rd111;
ld.global.f64 %fd200, [%rd110];
st.shared.f64 [%rd113], %fd200;
barrier.sync 0;
mul.wide.s32 %rd114, %r9, 8;
add.s64 %rd115, %rd17, %rd114;
ld.shared.f64 %fd201, [%rd115];
mul.f64 %fd202, %fd1, %fd201;
ld.shared.f64 %fd203, [%rd115+8192];
sub.f64 %fd204, %fd192, %fd203;
ld.shared.f64 %fd205, [%rd115+9216];
mul.f64 %fd206, %fd205, %fd183;
sub.f64 %fd207, %fd204, %fd206;
mul.f64 %fd208, %fd202, %fd207;
mov.b64 %rd116, %fd208;
sub.f64 %fd209, %fd193, %fd203;
mul.f64 %fd210, %fd205, %fd184;
sub.f64 %fd211, %fd209, %fd210;
mul.f64 %fd212, %fd202, %fd211;
mov.b64 %rd117, %fd212;
ld.shared.f64 %fd213, [%rd115+64];
mul.f64 %fd214, %fd1, %fd213;
ld.shared.f64 %fd215, [%rd115+8256];
sub.f64 %fd216, %fd194, %fd215;
ld.shared.f64 %fd217, [%rd115+9280];
mul.f64 %fd218, %fd217, %fd185;
sub.f64 %fd219, %fd216, %fd218;
mul.f64 %fd220, %fd214, %fd219;
mov.b64 %rd118, %fd220;
sub.f64 %fd221, %fd195, %fd215;
mul.f64 %fd222, %fd217, %fd186;
sub.f64 %fd223, %fd221, %fd222;
mul.f64 %fd224, %fd214, %fd223;
mov.b64 %rd119, %fd224;
ld.shared.f64 %fd225, [%rd115+128];
mul.f64 %fd226, %fd1, %fd225;
ld.shared.f64 %fd227, [%rd115+8320];
sub.f64 %fd228, %fd196, %fd227;
ld.shared.f64 %fd229, [%rd115+9344];
mul.f64 %fd230, %fd229, %fd187;
sub.f64 %fd231, %fd228, %fd230;
mul.f64 %fd232, %fd226, %fd231;
mov.b64 %rd120, %fd232;
sub.f64 %fd233, %fd197, %fd227;
mul.f64 %fd234, %fd229, %fd188;
sub.f64 %fd235, %fd233, %fd234;
mul.f64 %fd236, %fd226, %fd235;
mov.b64 %rd121, %fd236;
ld.shared.f64 %fd237, [%rd115+192];
mul.f64 %fd238, %fd1, %fd237;
ld.shared.f64 %fd239, [%rd115+8384];
sub.f64 %fd240, %fd198, %fd239;
ld.shared.f64 %fd241, [%rd115+9408];
mul.f64 %fd242, %fd241, %fd189;
sub.f64 %fd243, %fd240, %fd242;
mul.f64 %fd244, %fd238, %fd243;
mov.b64 %rd122, %fd244;
sub.f64 %fd245, %fd199, %fd239;
mul.f64 %fd246, %fd241, %fd190;
sub.f64 %fd247, %fd245, %fd246;
mul.f64 %fd248, %fd238, %fd247;
mov.b64 %rd123, %fd248;
shl.b32 %r298, %r4, 9;
add.s32 %r299, %r289, %r298;
mad.lo.s32 %r300, %r299, %r55, %r10;
add.s32 %r301, %r300, %r11;
mul.wide.s32 %rd124, %r301, 8;
add.s64 %rd84, %rd13, %rd124;
mov.b64 {%r272, %r273}, %rd116;
mov.b64 {%r274, %r275}, %rd117;
// begin inline asm
st.global.cs.v4.s32 [%rd84], {%r272,%r273,%r274,%r275};
// end inline asm
mov.b64 {%r276, %r277}, %rd118;
mov.b64 {%r278, %r279}, %rd119;
add.s64 %rd85, %rd84, %rd91;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r276,%r277,%r278,%r279};
// end inline asm
mov.b64 {%r280, %r281}, %rd120;
mov.b64 {%r282, %r283}, %rd121;
add.s64 %rd86, %rd85, %rd91;
// begin inline asm
st.global.cs.v4.s32 [%rd86], {%r280,%r281,%r282,%r283};
// end inline asm
mov.b64 {%r284, %r285}, %rd122;
mov.b64 {%r286, %r287}, %rd123;
add.s64 %rd87, %rd86, %rd91;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r284,%r285,%r286,%r287};
// end inline asm
bra.uni $L__BB0_99;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f64 %fd107, [%rd8];
st.shared.f64 [%rd6+9216], %fd107;
$L__BB0_9:
mov.u32 %r14, %ctaid.x;
add.s32 %r86, %r55, 31;
shr.s32 %r87, %r86, 31;
shr.u32 %r88, %r87, 27;
add.s32 %r89, %r86, %r88;
shr.s32 %r15, %r89, 5;
shl.b32 %r16, %r4, 4;
shr.s32 %r90, %r2, 31;
shr.u32 %r91, %r90, 28;
add.s32 %r92, %r2, %r91;
and.b32 %r93, %r92, 2147483632;
sub.s32 %r94, %r2, %r93;
shl.b32 %r95, %r94, 1;
rem.s32 %r96, %r14, %r15;
shl.b32 %r97, %r96, 5;
add.s32 %r20, %r97, %r95;
or.b32 %r17, %r20, 1;
setp.ge.s32 %p8, %r17, %r55;
shr.s32 %r18, %r92, 4;
add.s32 %r19, %r18, -216;
mov.f64 %fd265, 0d0000000000000000;
mov.f64 %fd267, 0d0000000000000000;
mov.f64 %fd266, %fd267;
@%p8 bra $L__BB0_12;
div.s32 %r98, %r14, %r15;
shl.b32 %r21, %r98, 5;
add.s32 %r99, %r19, %r21;
neg.s32 %r100, %r16;
setp.ge.s32 %p9, %r99, %r100;
@%p9 bra $L__BB0_12;
add.s32 %r105, %r16, %r18;
add.s32 %r106, %r105, %r21;
mad.lo.s32 %r107, %r106, %r55, %r20;
mul.wide.s32 %rd21, %r107, 8;
add.s64 %rd20, %rd4, %rd21;
// begin inline asm
ld.global.cs.v4.u32 {%r101,%r102,%r103,%r104}, [%rd20];
// end inline asm
mov.b64 %rd22, {%r101, %r102};
mov.b64 %fd266, %rd22;
mov.b64 %rd23, {%r103, %r104};
mov.b64 %fd265, %rd23;
$L__BB0_12:
mov.f64 %fd268, %fd267;
@%p8 bra $L__BB0_15;
div.s32 %r108, %r14, %r15;
shl.b32 %r22, %r108, 5;
add.s32 %r109, %r19, %r22;
mov.u32 %r110, -8;
sub.s32 %r111, %r110, %r16;
setp.ge.s32 %p11, %r109, %r111;
mov.f64 %fd268, %fd267;
@%p11 bra $L__BB0_15;
add.s32 %r116, %r16, %r18;
add.s32 %r117, %r116, %r22;
add.s32 %r118, %r117, 8;
mad.lo.s32 %r119, %r118, %r55, %r20;
mul.wide.s32 %rd25, %r119, 8;
add.s64 %rd24, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r112,%r113,%r114,%r115}, [%rd24];
// end inline asm
mov.b64 %rd26, {%r112, %r113};
mov.b64 %fd268, %rd26;
mov.b64 %rd27, {%r114, %r115};
mov.b64 %fd267, %rd27;
$L__BB0_15:
mov.f64 %fd269, 0d0000000000000000;
mov.f64 %fd271, 0d0000000000000000;
mov.f64 %fd270, %fd271;
@%p8 bra $L__BB0_18;
div.s32 %r120, %r14, %r15;
shl.b32 %r23, %r120, 5;
add.s32 %r121, %r19, %r23;
mov.u32 %r122, -16;
sub.s32 %r123, %r122, %r16;
setp.ge.s32 %p13, %r121, %r123;
@%p13 bra $L__BB0_18;
add.s32 %r128, %r16, %r18;
add.s32 %r129, %r128, %r23;
add.s32 %r130, %r129, 16;
mad.lo.s32 %r131, %r130, %r55, %r20;
mul.wide.s32 %rd29, %r131, 8;
add.s64 %rd28, %rd4, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r124,%r125,%r126,%r127}, [%rd28];
// end inline asm
mov.b64 %rd30, {%r124, %r125};
mov.b64 %fd270, %rd30;
mov.b64 %rd31, {%r126, %r127};
mov.b64 %fd269, %rd31;
$L__BB0_18:
mov.f64 %fd272, %fd271;
@%p8 bra $L__BB0_21;
div.s32 %r132, %r14, %r15;
shl.b32 %r24, %r132, 5;
add.s32 %r133, %r19, %r24;
mov.u32 %r134, -24;
sub.s32 %r135, %r134, %r16;
setp.ge.s32 %p15, %r133, %r135;
mov.f64 %fd272, %fd271;
@%p15 bra $L__BB0_21;
add.s32 %r140, %r16, %r18;
add.s32 %r141, %r140, %r24;
add.s32 %r142, %r141, 24;
mad.lo.s32 %r143, %r142, %r55, %r20;
mul.wide.s32 %rd33, %r143, 8;
add.s64 %rd32, %rd4, %rd33;
// begin inline asm
ld.global.cs.v4.u32 {%r136,%r137,%r138,%r139}, [%rd32];
// end inline asm
mov.b64 %rd34, {%r136, %r137};
mov.b64 %fd272, %rd34;
mov.b64 %rd35, {%r138, %r139};
mov.b64 %fd271, %rd35;
$L__BB0_21:
shl.b32 %r25, %r4, 6;
setp.gt.s32 %p16, %r2, 31;
@%p16 bra $L__BB0_24;
div.s32 %r144, %r14, %r15;
shl.b32 %r145, %r144, 5;
add.s32 %r26, %r145, %r2;
setp.gt.s32 %p17, %r26, 215;
@%p17 bra $L__BB0_24;
mul.wide.s32 %rd36, %r26, 8;
add.s64 %rd37, %rd2, %rd36;
ld.global.f64 %fd124, [%rd37];
st.shared.f64 [%rd6+8192], %fd124;
$L__BB0_24:
mov.f64 %fd273, 0d0000000000000000;
mov.f64 %fd275, 0d0000000000000000;
mov.f64 %fd274, %fd275;
@%p8 bra $L__BB0_27;
div.s32 %r146, %r14, %r15;
shl.b32 %r27, %r146, 5;
add.s32 %r147, %r19, %r27;
neg.s32 %r148, %r25;
setp.ge.s32 %p19, %r147, %r148;
@%p19 bra $L__BB0_27;
add.s32 %r153, %r25, %r18;
add.s32 %r154, %r153, %r27;
mad.lo.s32 %r155, %r154, %r55, %r20;
mul.wide.s32 %rd39, %r155, 8;
add.s64 %rd38, %rd3, %rd39;
// begin inline asm
ld.global.cs.v4.u32 {%r149,%r150,%r151,%r152}, [%rd38];
// end inline asm
mov.b64 %rd40, {%r149, %r150};
mov.b64 %fd274, %rd40;
mov.b64 %rd41, {%r151, %r152};
mov.b64 %fd273, %rd41;
$L__BB0_27:
mov.f64 %fd276, %fd275;
@%p8 bra $L__BB0_30;
div.s32 %r156, %r14, %r15;
shl.b32 %r28, %r156, 5;
add.s32 %r157, %r19, %r28;
mov.u32 %r158, -8;
sub.s32 %r159, %r158, %r25;
setp.ge.s32 %p21, %r157, %r159;
mov.f64 %fd276, %fd275;
@%p21 bra $L__BB0_30;
add.s32 %r164, %r25, %r18;
add.s32 %r165, %r164, %r28;
add.s32 %r166, %r165, 8;
mad.lo.s32 %r167, %r166, %r55, %r20;
mul.wide.s32 %rd43, %r167, 8;
add.s64 %rd42, %rd3, %rd43;
// begin inline asm
ld.global.cs.v4.u32 {%r160,%r161,%r162,%r163}, [%rd42];
// end inline asm
mov.b64 %rd44, {%r160, %r161};
mov.b64 %fd276, %rd44;
mov.b64 %rd45, {%r162, %r163};
mov.b64 %fd275, %rd45;
$L__BB0_30:
mov.f64 %fd277, 0d0000000000000000;
mov.f64 %fd279, 0d0000000000000000;
mov.f64 %fd278, %fd279;
@%p8 bra $L__BB0_33;
div.s32 %r168, %r14, %r15;
shl.b32 %r29, %r168, 5;
add.s32 %r169, %r19, %r29;
mov.u32 %r170, -16;
sub.s32 %r171, %r170, %r25;
setp.ge.s32 %p23, %r169, %r171;
@%p23 bra $L__BB0_33;
add.s32 %r176, %r25, %r18;
add.s32 %r177, %r176, %r29;
add.s32 %r178, %r177, 16;
mad.lo.s32 %r179, %r178, %r55, %r20;
mul.wide.s32 %rd47, %r179, 8;
add.s64 %rd46, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r172,%r173,%r174,%r175}, [%rd46];
// end inline asm
mov.b64 %rd48, {%r172, %r173};
mov.b64 %fd278, %rd48;
mov.b64 %rd49, {%r174, %r175};
mov.b64 %fd277, %rd49;
$L__BB0_33:
setp.lt.s32 %p24, %r17, %r55;
@%p24 bra $L__BB0_35;
bra.uni $L__BB0_34;
$L__BB0_35:
div.s32 %r180, %r14, %r15;
shl.b32 %r30, %r180, 5;
add.s32 %r181, %r19, %r30;
mov.u32 %r182, -24;
sub.s32 %r183, %r182, %r25;
setp.ge.s32 %p25, %r181, %r183;
mov.f64 %fd280, %fd279;
@%p25 bra $L__BB0_37;
add.s32 %r188, %r25, %r18;
add.s32 %r189, %r188, %r30;
add.s32 %r190, %r189, 24;
mad.lo.s32 %r191, %r190, %r55, %r20;
mul.wide.s32 %rd51, %r191, 8;
add.s64 %rd50, %rd3, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r184,%r185,%r186,%r187}, [%rd50];
// end inline asm
mov.b64 %rd52, {%r184, %r185};
mov.b64 %fd280, %rd52;
mov.b64 %rd53, {%r186, %r187};
mov.b64 %fd279, %rd53;
bra.uni $L__BB0_37;
$L__BB0_34:
mov.f64 %fd280, %fd279;
$L__BB0_37:
div.s32 %r192, %r14, %r15;
shl.b32 %r31, %r192, 5;
shr.u32 %r194, %r90, 27;
add.s32 %r195, %r2, %r194;
and.b32 %r196, %r195, -32;
sub.s32 %r32, %r2, %r196;
add.s32 %r197, %r31, %r32;
setp.gt.s32 %p26, %r197, 215;
mul.lo.s32 %r198, %r197, %r52;
mul.wide.s32 %rd54, %r198, 8;
add.s64 %rd9, %rd1, %rd54;
@%p26 bra $L__BB0_39;
mul.wide.s32 %rd55, %r32, 8;
add.s64 %rd57, %rd17, %rd55;
ld.global.f64 %fd141, [%rd9];
st.shared.f64 [%rd57], %fd141;
$L__BB0_39:
shl.b32 %r33, %r4, 8;
barrier.sync 0;
neg.s32 %r34, %r33;
add.s32 %r35, %r19, %r31;
setp.ge.s32 %p27, %r35, %r34;
mul.wide.s32 %rd58, %r18, 8;
add.s64 %rd10, %rd17, %rd58;
mov.f64 %fd282, 0d0000000000000000;
mov.f64 %fd281, %fd282;
@%p27 bra $L__BB0_41;
ld.shared.f64 %fd281, [%rd10+8192];
$L__BB0_41:
sub.f64 %fd36, %fd274, %fd281;
@%p27 bra $L__BB0_43;
ld.shared.f64 %fd282, [%rd10+9216];
$L__BB0_43:
mul.f64 %fd145, %fd282, %fd266;
sub.f64 %fd39, %fd36, %fd145;
mov.f64 %fd284, 0d0000000000000000;
mov.f64 %fd283, %fd284;
@%p27 bra $L__BB0_45;
ld.shared.f64 %fd146, [%rd10];
mul.f64 %fd283, %fd1, %fd146;
$L__BB0_45:
mul.f64 %fd42, %fd39, %fd283;
@%p27 bra $L__BB0_47;
ld.shared.f64 %fd284, [%rd10+8192];
$L__BB0_47:
sub.f64 %fd45, %fd273, %fd284;
mov.f64 %fd286, 0d0000000000000000;
mov.f64 %fd285, %fd286;
@%p27 bra $L__BB0_49;
ld.shared.f64 %fd285, [%rd10+9216];
$L__BB0_49:
mul.f64 %fd150, %fd285, %fd265;
sub.f64 %fd48, %fd45, %fd150;
@%p27 bra $L__BB0_51;
ld.shared.f64 %fd151, [%rd10];
mul.f64 %fd286, %fd1, %fd151;
$L__BB0_51:
mul.f64 %fd51, %fd48, %fd286;
mov.u32 %r199, -8;
sub.s32 %r36, %r199, %r33;
setp.ge.s32 %p33, %r35, %r36;
mov.f64 %fd288, 0d0000000000000000;
mov.f64 %fd287, %fd288;
@%p33 bra $L__BB0_53;
ld.shared.f64 %fd287, [%rd10+8256];
$L__BB0_53:
sub.f64 %fd54, %fd276, %fd287;
@%p33 bra $L__BB0_55;
ld.shared.f64 %fd288, [%rd10+9280];
$L__BB0_55:
mul.f64 %fd155, %fd288, %fd268;
sub.f64 %fd57, %fd54, %fd155;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd289, %fd290;
@%p33 bra $L__BB0_57;
ld.shared.f64 %fd156, [%rd10+64];
mul.f64 %fd289, %fd1, %fd156;
$L__BB0_57:
mul.f64 %fd60, %fd57, %fd289;
@%p33 bra $L__BB0_59;
ld.shared.f64 %fd290, [%rd10+8256];
$L__BB0_59:
sub.f64 %fd63, %fd275, %fd290;
mov.f64 %fd292, 0d0000000000000000;
mov.f64 %fd291, %fd292;
@%p33 bra $L__BB0_61;
ld.shared.f64 %fd291, [%rd10+9280];
$L__BB0_61:
mul.f64 %fd160, %fd291, %fd267;
sub.f64 %fd66, %fd63, %fd160;
@%p33 bra $L__BB0_63;
ld.shared.f64 %fd161, [%rd10+64];
mul.f64 %fd292, %fd1, %fd161;
$L__BB0_63:
mul.f64 %fd69, %fd66, %fd292;
mov.u32 %r200, -16;
sub.s32 %r37, %r200, %r33;
setp.ge.s32 %p39, %r35, %r37;
mov.f64 %fd294, 0d0000000000000000;
mov.f64 %fd293, %fd294;
@%p39 bra $L__BB0_65;
ld.shared.f64 %fd293, [%rd10+8320];
$L__BB0_65:
sub.f64 %fd72, %fd278, %fd293;
@%p39 bra $L__BB0_67;
ld.shared.f64 %fd294, [%rd10+9344];
$L__BB0_67:
mul.f64 %fd165, %fd294, %fd270;
sub.f64 %fd75, %fd72, %fd165;
mov.f64 %fd296, 0d0000000000000000;
mov.f64 %fd295, %fd296;
@%p39 bra $L__BB0_69;
ld.shared.f64 %fd166, [%rd10+128];
mul.f64 %fd295, %fd1, %fd166;
$L__BB0_69:
mul.f64 %fd78, %fd75, %fd295;
@%p39 bra $L__BB0_71;
ld.shared.f64 %fd296, [%rd10+8320];
$L__BB0_71:
sub.f64 %fd81, %fd277, %fd296;
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd297, %fd298;
@%p39 bra $L__BB0_73;
ld.shared.f64 %fd297, [%rd10+9344];
$L__BB0_73:
mul.f64 %fd170, %fd297, %fd269;
sub.f64 %fd84, %fd81, %fd170;
@%p39 bra $L__BB0_75;
ld.shared.f64 %fd171, [%rd10+128];
mul.f64 %fd298, %fd1, %fd171;
$L__BB0_75:
mul.f64 %fd87, %fd84, %fd298;
mov.u32 %r201, -24;
sub.s32 %r38, %r201, %r33;
setp.ge.s32 %p45, %r35, %r38;
mov.f64 %fd300, 0d0000000000000000;
mov.f64 %fd299, %fd300;
@%p45 bra $L__BB0_77;
ld.shared.f64 %fd299, [%rd10+8384];
$L__BB0_77:
sub.f64 %fd90, %fd280, %fd299;
@%p45 bra $L__BB0_79;
ld.shared.f64 %fd300, [%rd10+9408];
$L__BB0_79:
mul.f64 %fd175, %fd300, %fd272;
sub.f64 %fd93, %fd90, %fd175;
mov.f64 %fd302, 0d0000000000000000;
mov.f64 %fd301, %fd302;
@%p45 bra $L__BB0_81;
ld.shared.f64 %fd176, [%rd10+192];
mul.f64 %fd301, %fd1, %fd176;
$L__BB0_81:
mul.f64 %fd96, %fd93, %fd301;
@%p45 bra $L__BB0_83;
ld.shared.f64 %fd302, [%rd10+8384];
$L__BB0_83:
sub.f64 %fd99, %fd279, %fd302;
mov.f64 %fd304, 0d0000000000000000;
mov.f64 %fd303, %fd304;
@%p45 bra $L__BB0_85;
ld.shared.f64 %fd303, [%rd10+9408];
$L__BB0_85:
mul.f64 %fd180, %fd303, %fd271;
sub.f64 %fd102, %fd99, %fd180;
@%p45 bra $L__BB0_87;
ld.shared.f64 %fd181, [%rd10+192];
mul.f64 %fd304, %fd1, %fd181;
$L__BB0_87:
mul.f64 %fd105, %fd102, %fd304;
shl.b32 %r39, %r4, 9;
@%p8 bra $L__BB0_90;
neg.s32 %r202, %r39;
setp.ge.s32 %p52, %r35, %r202;
@%p52 bra $L__BB0_90;
add.s32 %r207, %r39, %r18;
add.s32 %r208, %r207, %r31;
mad.lo.s32 %r209, %r208, %r55, %r20;
mul.wide.s32 %rd61, %r209, 8;
add.s64 %rd60, %rd13, %rd61;
mov.b64 %rd62, %fd42;
mov.b64 {%r203, %r204}, %rd62;
mov.b64 %rd63, %fd51;
mov.b64 {%r205, %r206}, %rd63;
// begin inline asm
st.global.cs.v4.s32 [%rd60], {%r203,%r204,%r205,%r206};
// end inline asm
$L__BB0_90:
mov.u32 %r210, -8;
sub.s32 %r211, %r210, %r39;
setp.ge.s32 %p54, %r35, %r211;
or.pred %p55, %p8, %p54;
@%p55 bra $L__BB0_92;
add.s32 %r216, %r39, %r18;
add.s32 %r217, %r216, %r31;
add.s32 %r218, %r217, 8;
mad.lo.s32 %r219, %r218, %r55, %r20;
mul.wide.s32 %rd65, %r219, 8;
add.s64 %rd64, %rd13, %rd65;
mov.b64 %rd66, %fd60;
mov.b64 {%r212, %r213}, %rd66;
mov.b64 %rd67, %fd69;
mov.b64 {%r214, %r215}, %rd67;
// begin inline asm
st.global.cs.v4.s32 [%rd64], {%r212,%r213,%r214,%r215};
// end inline asm
$L__BB0_92:
@%p8 bra $L__BB0_95;
mov.u32 %r220, -16;
sub.s32 %r221, %r220, %r39;
setp.ge.s32 %p57, %r35, %r221;
@%p57 bra $L__BB0_95;
add.s32 %r226, %r39, %r18;
add.s32 %r227, %r226, %r31;
add.s32 %r228, %r227, 16;
mad.lo.s32 %r229, %r228, %r55, %r20;
mul.wide.s32 %rd69, %r229, 8;
add.s64 %rd68, %rd13, %rd69;
mov.b64 %rd70, %fd78;
mov.b64 {%r222, %r223}, %rd70;
mov.b64 %rd71, %fd87;
mov.b64 {%r224, %r225}, %rd71;
// begin inline asm
st.global.cs.v4.s32 [%rd68], {%r222,%r223,%r224,%r225};
// end inline asm
$L__BB0_95:
@%p8 bra $L__BB0_99;
mov.u32 %r230, -24;
sub.s32 %r231, %r230, %r39;
setp.ge.s32 %p59, %r35, %r231;
@%p59 bra $L__BB0_99;
add.s32 %r236, %r39, %r18;
add.s32 %r237, %r236, %r31;
add.s32 %r238, %r237, 24;
mad.lo.s32 %r239, %r238, %r55, %r20;
mul.wide.s32 %rd73, %r239, 8;
add.s64 %rd72, %rd13, %rd73;
mov.b64 %rd74, %fd96;
mov.b64 {%r232, %r233}, %rd74;
mov.b64 %rd75, %fd105;
mov.b64 {%r234, %r235}, %rd75;
// begin inline asm
st.global.cs.v4.s32 [%rd72], {%r232,%r233,%r234,%r235};
// end inline asm
$L__BB0_99:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<60>;
.reg .b32 %r<347>;
.reg .f64 %fd<313>;
.reg .b64 %rd<145>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r54, %r55}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0+16];
ld.param.v2.u32 {%r56, %r57}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3+8];
ld.param.u64 %rd17, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5];
ld.param.u64 %rd16, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4];
ld.param.u64 %rd15, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_2];
ld.param.u64 %rd18, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1];
cvta.to.global.u64 %rd1, %rd15;
cvta.to.global.u64 %rd2, %rd18;
ld.param.u64 %rd4, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r64, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r64;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd19, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r65, [%rd19], %r2;
ld.shared.u32 %r4, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_723310nvfuser_13ENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
cvt.rn.f64.s32 %fd106, %r57;
rcp.rn.f64 %fd1, %fd106;
mul.wide.s32 %rd20, %r2, 8;
mov.u64 %rd21, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_13_cu_9e673cb0_72335arrayE;
add.s64 %rd6, %rd21, %rd20;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r66, %r57, 31;
shr.s32 %r67, %r66, 31;
shr.u32 %r68, %r67, 27;
add.s32 %r69, %r66, %r68;
shr.s32 %r6, %r69, 5;
div.s32 %r70, %r5, %r6;
shl.b32 %r7, %r70, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd22, %rd16;
mul.wide.s32 %rd23, %r8, 8;
add.s64 %rd8, %rd22, %rd23;
@%p3 bra $L__BB0_7;
shr.s32 %r71, %r2, 31;
shr.u32 %r72, %r71, 28;
add.s32 %r73, %r2, %r72;
shr.s32 %r9, %r73, 4;
add.s32 %r10, %r9, 24;
add.s32 %r74, %r10, %r7;
setp.gt.s32 %p4, %r74, 215;
@%p4 bra $L__BB0_7;
rem.s32 %r75, %r5, %r6;
shl.b32 %r11, %r75, 5;
or.b32 %r76, %r11, 31;
setp.ge.s32 %p5, %r76, %r57;
@%p5 bra $L__BB0_7;
shr.u32 %r78, %r71, 27;
add.s32 %r79, %r2, %r78;
and.b32 %r80, %r79, -32;
sub.s32 %r81, %r2, %r80;
add.s32 %r12, %r7, %r81;
setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_98;
bra.uni $L__BB0_7;
$L__BB0_98:
ld.global.f64 %fd182, [%rd8];
st.shared.f64 [%rd6+9216], %fd182;
and.b32 %r306, %r73, -16;
sub.s32 %r307, %r2, %r306;
shl.b32 %r308, %r307, 1;
shl.b32 %r309, %r4, 4;
add.s32 %r310, %r7, %r9;
add.s32 %r311, %r310, %r309;
mad.lo.s32 %r312, %r311, %r57, %r308;
add.s32 %r313, %r312, %r11;
mul.wide.s32 %rd101, %r313, 8;
add.s64 %rd89, %rd4, %rd101;
// begin inline asm
ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd89];
// end inline asm
mov.b64 %rd102, {%r255, %r256};
mov.b64 %fd183, %rd102;
mov.b64 %rd103, {%r257, %r258};
mov.b64 %fd184, %rd103;
shl.b32 %r314, %r57, 3;
mul.wide.s32 %rd104, %r314, 8;
add.s64 %rd90, %rd89, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r259,%r260,%r261,%r262}, [%rd90];
// end inline asm
mov.b64 %rd105, {%r259, %r260};
mov.b64 %fd185, %rd105;
mov.b64 %rd106, {%r261, %r262};
mov.b64 %fd186, %rd106;
add.s64 %rd91, %rd90, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r263,%r264,%r265,%r266}, [%rd91];
// end inline asm
mov.b64 %rd107, {%r263, %r264};
mov.b64 %fd187, %rd107;
mov.b64 %rd108, {%r265, %r266};
mov.b64 %fd188, %rd108;
add.s64 %rd92, %rd91, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r267,%r268,%r269,%r270}, [%rd92];
// end inline asm
mov.b64 %rd109, {%r267, %r268};
mov.b64 %fd189, %rd109;
mov.b64 %rd110, {%r269, %r270};
mov.b64 %fd190, %rd110;
shl.b64 %rd111, %rd7, 3;
add.s64 %rd112, %rd2, %rd111;
ld.global.f64 %fd191, [%rd112];
st.shared.f64 [%rd6+8192], %fd191;
mad.lo.s32 %r315, %r4, 48, %r311;
mad.lo.s32 %r316, %r315, %r57, %r308;
add.s32 %r317, %r316, %r11;
mul.wide.s32 %rd113, %r317, 8;
add.s64 %rd93, %rd3, %rd113;
// begin inline asm
ld.global.cs.v4.u32 {%r271,%r272,%r273,%r274}, [%rd93];
// end inline asm
mov.b64 %rd114, {%r271, %r272};
mov.b64 %fd192, %rd114;
mov.b64 %rd115, {%r273, %r274};
mov.b64 %fd193, %rd115;
add.s64 %rd94, %rd93, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r275,%r276,%r277,%r278}, [%rd94];
// end inline asm
mov.b64 %rd116, {%r275, %r276};
mov.b64 %fd194, %rd116;
mov.b64 %rd117, {%r277, %r278};
mov.b64 %fd195, %rd117;
add.s64 %rd95, %rd94, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r279,%r280,%r281,%r282}, [%rd95];
// end inline asm
mov.b64 %rd118, {%r279, %r280};
mov.b64 %fd196, %rd118;
mov.b64 %rd119, {%r281, %r282};
mov.b64 %fd197, %rd119;
add.s64 %rd96, %rd95, %rd104;
// begin inline asm
ld.global.cs.v4.u32 {%r283,%r284,%r285,%r286}, [%rd96];
// end inline asm
mov.b64 %rd120, {%r283, %r284};
mov.b64 %fd198, %rd120;
mov.b64 %rd121, {%r285, %r286};
mov.b64 %fd199, %rd121;
mul.lo.s32 %r318, %r12, %r54;
mul.wide.s32 %rd122, %r318, 8;
add.s64 %rd123, %rd1, %rd122;
ld.global.f64 %fd200, [%rd123];
st.shared.f64 [%rd6], %fd200;
st.shared.f64 [%rd6+1024], %fd200;
st.shared.f64 [%rd6+2048], %fd200;
st.shared.f64 [%rd6+3072], %fd200;
st.shared.f64 [%rd6+4096], %fd200;
st.shared.f64 [%rd6+5120], %fd200;
st.shared.f64 [%rd6+6144], %fd200;
st.shared.f64 [%rd6+7168], %fd200;
barrier.sync 0;
shl.b32 %r319, %r307, 6;
add.s32 %r320, %r319, %r9;
shr.s32 %r321, %r9, 31;
shr.u32 %r322, %r321, 25;
add.s32 %r323, %r9, %r322;
and.b32 %r324, %r323, -128;
sub.s32 %r325, %r9, %r324;
mul.wide.s32 %rd124, %r325, 8;
add.s64 %rd126, %rd21, 8192;
add.s64 %rd127, %rd126, %rd124;
ld.shared.f64 %fd201, [%rd127];
sub.f64 %fd202, %fd192, %fd201;
ld.shared.f64 %fd203, [%rd127+1024];
mul.f64 %fd204, %fd203, %fd183;
sub.f64 %fd205, %fd202, %fd204;
mul.wide.s32 %rd128, %r320, 8;
add.s64 %rd129, %rd21, %rd128;
ld.shared.f64 %fd206, [%rd129];
mul.f64 %fd207, %fd1, %fd206;
mul.f64 %fd208, %fd207, %fd205;
mov.b64 %rd130, %fd208;
sub.f64 %fd209, %fd193, %fd201;
mul.f64 %fd210, %fd203, %fd184;
sub.f64 %fd211, %fd209, %fd210;
ld.shared.f64 %fd212, [%rd129+256];
mul.f64 %fd213, %fd1, %fd212;
mul.f64 %fd214, %fd213, %fd211;
mov.b64 %rd131, %fd214;
add.s32 %r326, %r9, 8;
shr.s32 %r327, %r326, 31;
shr.u32 %r328, %r327, 25;
add.s32 %r329, %r326, %r328;
and.b32 %r330, %r329, -128;
sub.s32 %r331, %r326, %r330;
mul.wide.s32 %rd132, %r331, 8;
add.s64 %rd133, %rd126, %rd132;
ld.shared.f64 %fd215, [%rd133];
sub.f64 %fd216, %fd194, %fd215;
ld.shared.f64 %fd217, [%rd133+1024];
mul.f64 %fd218, %fd217, %fd185;
sub.f64 %fd219, %fd216, %fd218;
ld.shared.f64 %fd220, [%rd129+64];
mul.f64 %fd221, %fd1, %fd220;
mul.f64 %fd222, %fd221, %fd219;
mov.b64 %rd134, %fd222;
sub.f64 %fd223, %fd195, %fd215;
mul.f64 %fd224, %fd217, %fd186;
sub.f64 %fd225, %fd223, %fd224;
ld.shared.f64 %fd226, [%rd129+320];
mul.f64 %fd227, %fd1, %fd226;
mul.f64 %fd228, %fd227, %fd225;
mov.b64 %rd135, %fd228;
add.s32 %r332, %r9, 16;
shr.s32 %r333, %r332, 31;
shr.u32 %r334, %r333, 25;
add.s32 %r335, %r332, %r334;
and.b32 %r336, %r335, -128;
sub.s32 %r337, %r332, %r336;
mul.wide.s32 %rd136, %r337, 8;
add.s64 %rd137, %rd126, %rd136;
ld.shared.f64 %fd229, [%rd137];
sub.f64 %fd230, %fd196, %fd229;
ld.shared.f64 %fd231, [%rd137+1024];
mul.f64 %fd232, %fd231, %fd187;
sub.f64 %fd233, %fd230, %fd232;
ld.shared.f64 %fd234, [%rd129+128];
mul.f64 %fd235, %fd1, %fd234;
mul.f64 %fd236, %fd235, %fd233;
mov.b64 %rd138, %fd236;
sub.f64 %fd237, %fd197, %fd229;
mul.f64 %fd238, %fd231, %fd188;
sub.f64 %fd239, %fd237, %fd238;
ld.shared.f64 %fd240, [%rd129+384];
mul.f64 %fd241, %fd1, %fd240;
mul.f64 %fd242, %fd241, %fd239;
mov.b64 %rd139, %fd242;
shr.s32 %r338, %r10, 31;
shr.u32 %r339, %r338, 25;
add.s32 %r340, %r10, %r339;
and.b32 %r341, %r340, -128;
sub.s32 %r342, %r10, %r341;
mul.wide.s32 %rd140, %r342, 8;
add.s64 %rd141, %rd126, %rd140;
ld.shared.f64 %fd243, [%rd141];
sub.f64 %fd244, %fd198, %fd243;
ld.shared.f64 %fd245, [%rd141+1024];
mul.f64 %fd246, %fd245, %fd189;
sub.f64 %fd247, %fd244, %fd246;
ld.shared.f64 %fd248, [%rd129+192];
mul.f64 %fd249, %fd1, %fd248;
mul.f64 %fd250, %fd249, %fd247;
mov.b64 %rd142, %fd250;
sub.f64 %fd251, %fd199, %fd243;
mul.f64 %fd252, %fd245, %fd190;
sub.f64 %fd253, %fd251, %fd252;
ld.shared.f64 %fd254, [%rd129+448];
mul.f64 %fd255, %fd1, %fd254;
mul.f64 %fd256, %fd255, %fd253;
mov.b64 %rd143, %fd256;
shl.b32 %r343, %r4, 9;
add.s32 %r344, %r310, %r343;
mad.lo.s32 %r345, %r344, %r57, %r308;
add.s32 %r346, %r345, %r11;
mul.wide.s32 %rd144, %r346, 8;
add.s64 %rd97, %rd17, %rd144;
mov.b64 {%r287, %r288}, %rd130;
mov.b64 {%r289, %r290}, %rd131;
// begin inline asm
st.global.cs.v4.s32 [%rd97], {%r287,%r288,%r289,%r290};
// end inline asm
mov.b64 {%r291, %r292}, %rd134;
mov.b64 {%r293, %r294}, %rd135;
add.s64 %rd98, %rd97, %rd104;
// begin inline asm
st.global.cs.v4.s32 [%rd98], {%r291,%r292,%r293,%r294};
// end inline asm
mov.b64 {%r295, %r296}, %rd138;
mov.b64 {%r297, %r298}, %rd139;
add.s64 %rd99, %rd98, %rd104;
// begin inline asm
st.global.cs.v4.s32 [%rd99], {%r295,%r296,%r297,%r298};
// end inline asm
mov.b64 {%r299, %r300}, %rd142;
mov.b64 {%r301, %r302}, %rd143;
add.s64 %rd100, %rd99, %rd104;
// begin inline asm
st.global.cs.v4.s32 [%rd100], {%r299,%r300,%r301,%r302};
// end inline asm
bra.uni $L__BB0_99;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f64 %fd107, [%rd8];
st.shared.f64 [%rd6+9216], %fd107;
$L__BB0_9:
mov.u32 %r13, %ctaid.x;
add.s32 %r82, %r57, 31;
shr.s32 %r83, %r82, 31;
shr.u32 %r84, %r83, 27;
add.s32 %r85, %r82, %r84;
shr.s32 %r14, %r85, 5;
shl.b32 %r15, %r4, 4;
shr.s32 %r86, %r2, 31;
shr.u32 %r87, %r86, 28;
add.s32 %r88, %r2, %r87;
and.b32 %r89, %r88, -16;
sub.s32 %r16, %r2, %r89;
shl.b32 %r90, %r16, 1;
rem.s32 %r91, %r13, %r14;
shl.b32 %r92, %r91, 5;
add.s32 %r20, %r92, %r90;
or.b32 %r17, %r20, 1;
setp.ge.s32 %p8, %r17, %r57;
shr.s32 %r18, %r88, 4;
add.s32 %r19, %r18, -216;
mov.f64 %fd273, 0d0000000000000000;
mov.f64 %fd275, 0d0000000000000000;
mov.f64 %fd274, %fd275;
@%p8 bra $L__BB0_12;
div.s32 %r93, %r13, %r14;
shl.b32 %r21, %r93, 5;
add.s32 %r94, %r19, %r21;
neg.s32 %r95, %r15;
setp.ge.s32 %p9, %r94, %r95;
@%p9 bra $L__BB0_12;
add.s32 %r100, %r15, %r18;
add.s32 %r101, %r100, %r21;
mad.lo.s32 %r102, %r101, %r57, %r20;
mul.wide.s32 %rd25, %r102, 8;
add.s64 %rd24, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r96,%r97,%r98,%r99}, [%rd24];
// end inline asm
mov.b64 %rd26, {%r96, %r97};
mov.b64 %fd274, %rd26;
mov.b64 %rd27, {%r98, %r99};
mov.b64 %fd273, %rd27;
$L__BB0_12:
mov.f64 %fd276, %fd275;
@%p8 bra $L__BB0_15;
div.s32 %r103, %r13, %r14;
shl.b32 %r22, %r103, 5;
add.s32 %r104, %r19, %r22;
mov.u32 %r105, -8;
sub.s32 %r106, %r105, %r15;
setp.ge.s32 %p11, %r104, %r106;
mov.f64 %fd276, %fd275;
@%p11 bra $L__BB0_15;
add.s32 %r111, %r15, %r18;
add.s32 %r112, %r111, %r22;
add.s32 %r113, %r112, 8;
mad.lo.s32 %r114, %r113, %r57, %r20;
mul.wide.s32 %rd29, %r114, 8;
add.s64 %rd28, %rd4, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r107,%r108,%r109,%r110}, [%rd28];
// end inline asm
mov.b64 %rd30, {%r107, %r108};
mov.b64 %fd276, %rd30;
mov.b64 %rd31, {%r109, %r110};
mov.b64 %fd275, %rd31;
$L__BB0_15:
mov.f64 %fd277, 0d0000000000000000;
mov.f64 %fd279, 0d0000000000000000;
mov.f64 %fd278, %fd279;
@%p8 bra $L__BB0_18;
div.s32 %r115, %r13, %r14;
shl.b32 %r23, %r115, 5;
add.s32 %r116, %r19, %r23;
mov.u32 %r117, -16;
sub.s32 %r118, %r117, %r15;
setp.ge.s32 %p13, %r116, %r118;
@%p13 bra $L__BB0_18;
add.s32 %r123, %r15, %r18;
add.s32 %r124, %r123, %r23;
add.s32 %r125, %r124, 16;
mad.lo.s32 %r126, %r125, %r57, %r20;
mul.wide.s32 %rd33, %r126, 8;
add.s64 %rd32, %rd4, %rd33;
// begin inline asm
ld.global.cs.v4.u32 {%r119,%r120,%r121,%r122}, [%rd32];
// end inline asm
mov.b64 %rd34, {%r119, %r120};
mov.b64 %fd278, %rd34;
mov.b64 %rd35, {%r121, %r122};
mov.b64 %fd277, %rd35;
$L__BB0_18:
mov.f64 %fd280, %fd279;
@%p8 bra $L__BB0_21;
div.s32 %r127, %r13, %r14;
shl.b32 %r24, %r127, 5;
add.s32 %r128, %r19, %r24;
mov.u32 %r129, -24;
sub.s32 %r130, %r129, %r15;
setp.ge.s32 %p15, %r128, %r130;
mov.f64 %fd280, %fd279;
@%p15 bra $L__BB0_21;
add.s32 %r135, %r15, %r18;
add.s32 %r136, %r135, %r24;
add.s32 %r137, %r136, 24;
mad.lo.s32 %r138, %r137, %r57, %r20;
mul.wide.s32 %rd37, %r138, 8;
add.s64 %rd36, %rd4, %rd37;
// begin inline asm
ld.global.cs.v4.u32 {%r131,%r132,%r133,%r134}, [%rd36];
// end inline asm
mov.b64 %rd38, {%r131, %r132};
mov.b64 %fd280, %rd38;
mov.b64 %rd39, {%r133, %r134};
mov.b64 %fd279, %rd39;
$L__BB0_21:
shl.b32 %r25, %r4, 6;
setp.gt.s32 %p16, %r2, 31;
@%p16 bra $L__BB0_24;
div.s32 %r139, %r13, %r14;
shl.b32 %r140, %r139, 5;
add.s32 %r26, %r140, %r2;
setp.gt.s32 %p17, %r26, 215;
@%p17 bra $L__BB0_24;
mul.wide.s32 %rd40, %r26, 8;
add.s64 %rd41, %rd2, %rd40;
ld.global.f64 %fd124, [%rd41];
st.shared.f64 [%rd6+8192], %fd124;
$L__BB0_24:
mov.f64 %fd281, 0d0000000000000000;
mov.f64 %fd283, 0d0000000000000000;
mov.f64 %fd282, %fd283;
@%p8 bra $L__BB0_27;
div.s32 %r141, %r13, %r14;
shl.b32 %r27, %r141, 5;
add.s32 %r142, %r19, %r27;
neg.s32 %r143, %r25;
setp.ge.s32 %p19, %r142, %r143;
@%p19 bra $L__BB0_27;
add.s32 %r148, %r25, %r18;
add.s32 %r149, %r148, %r27;
mad.lo.s32 %r150, %r149, %r57, %r20;
mul.wide.s32 %rd43, %r150, 8;
add.s64 %rd42, %rd3, %rd43;
// begin inline asm
ld.global.cs.v4.u32 {%r144,%r145,%r146,%r147}, [%rd42];
// end inline asm
mov.b64 %rd44, {%r144, %r145};
mov.b64 %fd282, %rd44;
mov.b64 %rd45, {%r146, %r147};
mov.b64 %fd281, %rd45;
$L__BB0_27:
mov.f64 %fd284, %fd283;
@%p8 bra $L__BB0_30;
div.s32 %r151, %r13, %r14;
shl.b32 %r28, %r151, 5;
add.s32 %r152, %r19, %r28;
mov.u32 %r153, -8;
sub.s32 %r154, %r153, %r25;
setp.ge.s32 %p21, %r152, %r154;
mov.f64 %fd284, %fd283;
@%p21 bra $L__BB0_30;
add.s32 %r159, %r25, %r18;
add.s32 %r160, %r159, %r28;
add.s32 %r161, %r160, 8;
mad.lo.s32 %r162, %r161, %r57, %r20;
mul.wide.s32 %rd47, %r162, 8;
add.s64 %rd46, %rd3, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd46];
// end inline asm
mov.b64 %rd48, {%r155, %r156};
mov.b64 %fd284, %rd48;
mov.b64 %rd49, {%r157, %r158};
mov.b64 %fd283, %rd49;
$L__BB0_30:
mov.f64 %fd285, 0d0000000000000000;
mov.f64 %fd287, 0d0000000000000000;
mov.f64 %fd286, %fd287;
@%p8 bra $L__BB0_33;
div.s32 %r163, %r13, %r14;
shl.b32 %r29, %r163, 5;
add.s32 %r164, %r19, %r29;
mov.u32 %r165, -16;
sub.s32 %r166, %r165, %r25;
setp.ge.s32 %p23, %r164, %r166;
@%p23 bra $L__BB0_33;
add.s32 %r171, %r25, %r18;
add.s32 %r172, %r171, %r29;
add.s32 %r173, %r172, 16;
mad.lo.s32 %r174, %r173, %r57, %r20;
mul.wide.s32 %rd51, %r174, 8;
add.s64 %rd50, %rd3, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r167,%r168,%r169,%r170}, [%rd50];
// end inline asm
mov.b64 %rd52, {%r167, %r168};
mov.b64 %fd286, %rd52;
mov.b64 %rd53, {%r169, %r170};
mov.b64 %fd285, %rd53;
$L__BB0_33:
setp.lt.s32 %p24, %r17, %r57;
@%p24 bra $L__BB0_35;
bra.uni $L__BB0_34;
$L__BB0_35:
div.s32 %r175, %r13, %r14;
shl.b32 %r30, %r175, 5;
add.s32 %r176, %r19, %r30;
mov.u32 %r177, -24;
sub.s32 %r178, %r177, %r25;
setp.ge.s32 %p25, %r176, %r178;
mov.f64 %fd288, %fd287;
@%p25 bra $L__BB0_37;
add.s32 %r183, %r25, %r18;
add.s32 %r184, %r183, %r30;
add.s32 %r185, %r184, 24;
mad.lo.s32 %r186, %r185, %r57, %r20;
mul.wide.s32 %rd55, %r186, 8;
add.s64 %rd54, %rd3, %rd55;
// begin inline asm
ld.global.cs.v4.u32 {%r179,%r180,%r181,%r182}, [%rd54];
// end inline asm
mov.b64 %rd56, {%r179, %r180};
mov.b64 %fd288, %rd56;
mov.b64 %rd57, {%r181, %r182};
mov.b64 %fd287, %rd57;
bra.uni $L__BB0_37;
$L__BB0_34:
mov.f64 %fd288, %fd287;
$L__BB0_37:
div.s32 %r187, %r13, %r14;
shl.b32 %r31, %r187, 5;
shr.u32 %r189, %r86, 27;
add.s32 %r190, %r2, %r189;
and.b32 %r191, %r190, -32;
sub.s32 %r192, %r2, %r191;
add.s32 %r193, %r31, %r192;
setp.gt.s32 %p26, %r193, 215;
mul.lo.s32 %r194, %r193, %r54;
mul.wide.s32 %rd58, %r194, 8;
add.s64 %rd9, %rd1, %rd58;
@%p26 bra $L__BB0_39;
ld.global.f64 %fd141, [%rd9];
st.shared.f64 [%rd6], %fd141;
st.shared.f64 [%rd6+1024], %fd141;
st.shared.f64 [%rd6+2048], %fd141;
st.shared.f64 [%rd6+3072], %fd141;
st.shared.f64 [%rd6+4096], %fd141;
st.shared.f64 [%rd6+5120], %fd141;
st.shared.f64 [%rd6+6144], %fd141;
st.shared.f64 [%rd6+7168], %fd141;
$L__BB0_39:
shl.b32 %r32, %r4, 8;
barrier.sync 0;
neg.s32 %r33, %r32;
add.s32 %r34, %r19, %r31;
setp.ge.s32 %p27, %r34, %r33;
shr.s32 %r195, %r18, 31;
shr.u32 %r196, %r195, 25;
add.s32 %r197, %r18, %r196;
and.b32 %r198, %r197, -128;
sub.s32 %r199, %r18, %r198;
mul.wide.s32 %rd59, %r199, 8;
add.s64 %rd61, %rd21, %rd59;
add.s64 %rd10, %rd61, 8192;
mov.f64 %fd290, 0d0000000000000000;
mov.f64 %fd289, %fd290;
@%p27 bra $L__BB0_41;
ld.shared.f64 %fd289, [%rd10];
$L__BB0_41:
sub.f64 %fd36, %fd282, %fd289;
@%p27 bra $L__BB0_43;
ld.shared.f64 %fd290, [%rd10+1024];
$L__BB0_43:
mul.f64 %fd145, %fd290, %fd274;
sub.f64 %fd39, %fd36, %fd145;
shl.b32 %r200, %r16, 6;
add.s32 %r201, %r200, %r18;
mul.wide.s32 %rd62, %r201, 8;
add.s64 %rd11, %rd21, %rd62;
mov.f64 %fd292, 0d0000000000000000;
mov.f64 %fd291, %fd292;
@%p27 bra $L__BB0_45;
ld.shared.f64 %fd146, [%rd11];
mul.f64 %fd291, %fd1, %fd146;
$L__BB0_45:
mul.f64 %fd42, %fd39, %fd291;
@%p27 bra $L__BB0_47;
ld.shared.f64 %fd292, [%rd10];
$L__BB0_47:
sub.f64 %fd45, %fd281, %fd292;
mov.f64 %fd294, 0d0000000000000000;
mov.f64 %fd293, %fd294;
@%p27 bra $L__BB0_49;
ld.shared.f64 %fd293, [%rd10+1024];
$L__BB0_49:
mul.f64 %fd150, %fd293, %fd273;
sub.f64 %fd48, %fd45, %fd150;
@%p27 bra $L__BB0_51;
ld.shared.f64 %fd151, [%rd11+256];
mul.f64 %fd294, %fd1, %fd151;
$L__BB0_51:
mul.f64 %fd51, %fd48, %fd294;
mov.u32 %r202, -8;
sub.s32 %r35, %r202, %r32;
setp.ge.s32 %p33, %r34, %r35;
add.s32 %r36, %r18, 8;
shr.s32 %r203, %r36, 31;
shr.u32 %r204, %r203, 25;
add.s32 %r205, %r36, %r204;
and.b32 %r206, %r205, -128;
sub.s32 %r207, %r36, %r206;
mul.wide.s32 %rd64, %r207, 8;
add.s64 %rd66, %rd21, %rd64;
add.s64 %rd12, %rd66, 8192;
mov.f64 %fd296, 0d0000000000000000;
mov.f64 %fd295, %fd296;
@%p33 bra $L__BB0_53;
ld.shared.f64 %fd295, [%rd12];
$L__BB0_53:
sub.f64 %fd54, %fd284, %fd295;
@%p33 bra $L__BB0_55;
ld.shared.f64 %fd296, [%rd12+1024];
$L__BB0_55:
mul.f64 %fd155, %fd296, %fd276;
sub.f64 %fd57, %fd54, %fd155;
mov.f64 %fd298, 0d0000000000000000;
mov.f64 %fd297, %fd298;
@%p33 bra $L__BB0_57;
ld.shared.f64 %fd156, [%rd11+64];
mul.f64 %fd297, %fd1, %fd156;
$L__BB0_57:
mul.f64 %fd60, %fd57, %fd297;
@%p33 bra $L__BB0_59;
ld.shared.f64 %fd298, [%rd12];
$L__BB0_59:
sub.f64 %fd63, %fd283, %fd298;
mov.f64 %fd300, 0d0000000000000000;
mov.f64 %fd299, %fd300;
@%p33 bra $L__BB0_61;
ld.shared.f64 %fd299, [%rd12+1024];
$L__BB0_61:
mul.f64 %fd160, %fd299, %fd275;
sub.f64 %fd66, %fd63, %fd160;
@%p33 bra $L__BB0_63;
ld.shared.f64 %fd161, [%rd11+320];
mul.f64 %fd300, %fd1, %fd161;
$L__BB0_63:
mul.f64 %fd69, %fd66, %fd300;
mov.u32 %r208, -16;
sub.s32 %r37, %r208, %r32;
setp.ge.s32 %p39, %r34, %r37;
add.s32 %r38, %r18, 16;
shr.s32 %r209, %r38, 31;
shr.u32 %r210, %r209, 25;
add.s32 %r211, %r38, %r210;
and.b32 %r212, %r211, -128;
sub.s32 %r213, %r38, %r212;
mul.wide.s32 %rd67, %r213, 8;
add.s64 %rd69, %rd21, %rd67;
add.s64 %rd13, %rd69, 8192;
mov.f64 %fd302, 0d0000000000000000;
mov.f64 %fd301, %fd302;
@%p39 bra $L__BB0_65;
ld.shared.f64 %fd301, [%rd13];
$L__BB0_65:
sub.f64 %fd72, %fd286, %fd301;
@%p39 bra $L__BB0_67;
ld.shared.f64 %fd302, [%rd13+1024];
$L__BB0_67:
mul.f64 %fd165, %fd302, %fd278;
sub.f64 %fd75, %fd72, %fd165;
mov.f64 %fd304, 0d0000000000000000;
mov.f64 %fd303, %fd304;
@%p39 bra $L__BB0_69;
ld.shared.f64 %fd166, [%rd11+128];
mul.f64 %fd303, %fd1, %fd166;
$L__BB0_69:
mul.f64 %fd78, %fd75, %fd303;
@%p39 bra $L__BB0_71;
ld.shared.f64 %fd304, [%rd13];
$L__BB0_71:
sub.f64 %fd81, %fd285, %fd304;
mov.f64 %fd306, 0d0000000000000000;
mov.f64 %fd305, %fd306;
@%p39 bra $L__BB0_73;
ld.shared.f64 %fd305, [%rd13+1024];
$L__BB0_73:
mul.f64 %fd170, %fd305, %fd277;
sub.f64 %fd84, %fd81, %fd170;
@%p39 bra $L__BB0_75;
ld.shared.f64 %fd171, [%rd11+384];
mul.f64 %fd306, %fd1, %fd171;
$L__BB0_75:
mul.f64 %fd87, %fd84, %fd306;
mov.u32 %r214, -24;
sub.s32 %r39, %r214, %r32;
setp.ge.s32 %p45, %r34, %r39;
add.s32 %r40, %r18, 24;
shr.s32 %r215, %r40, 31;
shr.u32 %r216, %r215, 25;
add.s32 %r217, %r40, %r216;
and.b32 %r218, %r217, -128;
sub.s32 %r219, %r40, %r218;
mul.wide.s32 %rd70, %r219, 8;
add.s64 %rd72, %rd21, %rd70;
add.s64 %rd14, %rd72, 8192;
mov.f64 %fd308, 0d0000000000000000;
mov.f64 %fd307, %fd308;
@%p45 bra $L__BB0_77;
ld.shared.f64 %fd307, [%rd14];
$L__BB0_77:
sub.f64 %fd90, %fd288, %fd307;
@%p45 bra $L__BB0_79;
ld.shared.f64 %fd308, [%rd14+1024];
$L__BB0_79:
mul.f64 %fd175, %fd308, %fd280;
sub.f64 %fd93, %fd90, %fd175;
mov.f64 %fd310, 0d0000000000000000;
mov.f64 %fd309, %fd310;
@%p45 bra $L__BB0_81;
ld.shared.f64 %fd176, [%rd11+192];
mul.f64 %fd309, %fd1, %fd176;
$L__BB0_81:
mul.f64 %fd96, %fd93, %fd309;
@%p45 bra $L__BB0_83;
ld.shared.f64 %fd310, [%rd14];
$L__BB0_83:
sub.f64 %fd99, %fd287, %fd310;
mov.f64 %fd312, 0d0000000000000000;
mov.f64 %fd311, %fd312;
@%p45 bra $L__BB0_85;
ld.shared.f64 %fd311, [%rd14+1024];
$L__BB0_85:
mul.f64 %fd180, %fd311, %fd279;
sub.f64 %fd102, %fd99, %fd180;
@%p45 bra $L__BB0_87;
ld.shared.f64 %fd181, [%rd11+448];
mul.f64 %fd312, %fd1, %fd181;
$L__BB0_87:
mul.f64 %fd105, %fd102, %fd312;
shl.b32 %r41, %r4, 9;
@%p8 bra $L__BB0_90;
neg.s32 %r220, %r41;
setp.ge.s32 %p52, %r34, %r220;
@%p52 bra $L__BB0_90;
add.s32 %r225, %r41, %r18;
add.s32 %r226, %r225, %r31;
mad.lo.s32 %r227, %r226, %r57, %r20;
mul.wide.s32 %rd74, %r227, 8;
add.s64 %rd73, %rd17, %rd74;
mov.b64 %rd75, %fd42;
mov.b64 {%r221, %r222}, %rd75;
mov.b64 %rd76, %fd51;
mov.b64 {%r223, %r224}, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd73], {%r221,%r222,%r223,%r224};
// end inline asm
$L__BB0_90:
mov.u32 %r228, -8;
sub.s32 %r229, %r228, %r41;
setp.ge.s32 %p54, %r34, %r229;
or.pred %p55, %p8, %p54;
@%p55 bra $L__BB0_92;
add.s32 %r234, %r36, %r41;
add.s32 %r235, %r234, %r31;
mad.lo.s32 %r236, %r235, %r57, %r20;
mul.wide.s32 %rd78, %r236, 8;
add.s64 %rd77, %rd17, %rd78;
mov.b64 %rd79, %fd60;
mov.b64 {%r230, %r231}, %rd79;
mov.b64 %rd80, %fd69;
mov.b64 {%r232, %r233}, %rd80;
// begin inline asm
st.global.cs.v4.s32 [%rd77], {%r230,%r231,%r232,%r233};
// end inline asm
$L__BB0_92:
@%p8 bra $L__BB0_95;
mov.u32 %r237, -16;
sub.s32 %r238, %r237, %r41;
setp.ge.s32 %p57, %r34, %r238;
@%p57 bra $L__BB0_95;
add.s32 %r243, %r38, %r41;
add.s32 %r244, %r243, %r31;
mad.lo.s32 %r245, %r244, %r57, %r20;
mul.wide.s32 %rd82, %r245, 8;
add.s64 %rd81, %rd17, %rd82;
mov.b64 %rd83, %fd78;
mov.b64 {%r239, %r240}, %rd83;
mov.b64 %rd84, %fd87;
mov.b64 {%r241, %r242}, %rd84;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r239,%r240,%r241,%r242};
// end inline asm
$L__BB0_95:
@%p8 bra $L__BB0_99;
mov.u32 %r246, -24;
sub.s32 %r247, %r246, %r41;
setp.ge.s32 %p59, %r34, %r247;
@%p59 bra $L__BB0_99;
add.s32 %r252, %r40, %r41;
add.s32 %r253, %r252, %r31;
mad.lo.s32 %r254, %r253, %r57, %r20;
mul.wide.s32 %rd86, %r254, 8;
add.s64 %rd85, %rd17, %rd86;
mov.b64 %rd87, %fd96;
mov.b64 {%r248, %r249}, %rd87;
mov.b64 %rd88, %fd105;
mov.b64 {%r250, %r251}, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r248,%r249,%r250,%r251};
// end inline asm
$L__BB0_99:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -24,839 +24,923 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<60>;
- .reg .b32 %r<302>;
- .reg .f64 %fd<305>;
- .reg .b64 %rd<125>;
+ .reg .b32 %r<347>;
+ .reg .f64 %fd<313>;
+ .reg .b64 %rd<145>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
- ld.param.v2.u32 {%r52, %r53}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0+16];
- ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3+8];
- ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5];
- ld.param.u64 %rd12, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4];
- ld.param.u64 %rd11, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0];
+ ld.param.v2.u32 {%r54, %r55}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0+16];
+ ld.param.v2.u32 {%r56, %r57}, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3+8];
+ ld.param.u64 %rd17, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_5];
+ ld.param.u64 %rd16, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_4];
+ ld.param.u64 %rd15, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_2];
- ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1];
- cvta.to.global.u64 %rd1, %rd11;
- cvta.to.global.u64 %rd2, %rd14;
+ ld.param.u64 %rd18, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_1];
+ cvta.to.global.u64 %rd1, %rd15;
+ cvta.to.global.u64 %rd2, %rd18;
ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
- mov.u32 %r62, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r62;
+ mov.u32 %r64, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r64;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd15, _ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r63, [%rd15], %r2;
+ mov.u64 %rd19, _ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
+ atom.shared.min.s32 %r65, [%rd19], %r2;
ld.shared.u32 %r4, [_ZZN11kernelscope6kernelENS_6TensorIdLi2ELi2EEENS0_IdLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
- cvt.rn.f64.s32 %fd106, %r55;
+ cvt.rn.f64.s32 %fd106, %r57;
rcp.rn.f64 %fd1, %fd106;
- mul.wide.s32 %rd16, %r2, 8;
- mov.u64 %rd17, _ZN11kernelscope6kernelE;
- add.s64 %rd6, %rd17, %rd16;
+ mul.wide.s32 %rd20, %r2, 8;
+ mov.u64 %rd21, _ZN11kernelscope6kernelE;
+ add.s64 %rd6, %rd21, %rd20;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
- add.s32 %r64, %r55, 31;
- shr.s32 %r65, %r64, 31;
- shr.u32 %r66, %r65, 27;
- add.s32 %r67, %r64, %r66;
- shr.s32 %r6, %r67, 5;
- div.s32 %r68, %r5, %r6;
- shl.b32 %r7, %r68, 5;
+ add.s32 %r66, %r57, 31;
+ shr.s32 %r67, %r66, 31;
+ shr.u32 %r68, %r67, 27;
+ add.s32 %r69, %r66, %r68;
+ shr.s32 %r6, %r69, 5;
+ div.s32 %r70, %r5, %r6;
+ shl.b32 %r7, %r70, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
- cvta.to.global.u64 %rd18, %rd12;
- mul.wide.s32 %rd19, %r8, 8;
- add.s64 %rd8, %rd18, %rd19;
+ cvta.to.global.u64 %rd22, %rd16;
+ mul.wide.s32 %rd23, %r8, 8;
+ add.s64 %rd8, %rd22, %rd23;
@%p3 bra $L__BB0_7;
- shr.s32 %r69, %r2, 31;
- shr.u32 %r70, %r69, 28;
- add.s32 %r71, %r2, %r70;
- shr.s32 %r9, %r71, 4;
- add.s32 %r72, %r9, %r7;
- add.s32 %r73, %r72, 24;
- setp.gt.s32 %p4, %r73, 215;
+ shr.s32 %r71, %r2, 31;
+ shr.u32 %r72, %r71, 28;
+ add.s32 %r73, %r2, %r72;
+ shr.s32 %r9, %r73, 4;
+ add.s32 %r10, %r9, 24;
+ add.s32 %r74, %r10, %r7;
+ setp.gt.s32 %p4, %r74, 215;
@%p4 bra $L__BB0_7;
- and.b32 %r77, %r71, 2147483632;
- sub.s32 %r78, %r2, %r77;
- shl.b32 %r10, %r78, 1;
- rem.s32 %r79, %r5, %r6;
- shl.b32 %r11, %r79, 5;
- add.s32 %r80, %r10, %r11;
- or.b32 %r81, %r80, 1;
- setp.ge.s32 %p5, %r81, %r55;
+ rem.s32 %r75, %r5, %r6;
+ shl.b32 %r11, %r75, 5;
+ or.b32 %r76, %r11, 31;
+ setp.ge.s32 %p5, %r76, %r57;
@%p5 bra $L__BB0_7;
- shr.u32 %r83, %r69, 27;
- add.s32 %r84, %r2, %r83;
- and.b32 %r85, %r84, -32;
- sub.s32 %r12, %r2, %r85;
- add.s32 %r13, %r7, %r12;
- setp.lt.s32 %p6, %r13, 216;
+ shr.u32 %r78, %r71, 27;
+ add.s32 %r79, %r2, %r78;
+ and.b32 %r80, %r79, -32;
+ sub.s32 %r81, %r2, %r80;
+ add.s32 %r12, %r7, %r81;
+ setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_98;
bra.uni $L__BB0_7;
$L__BB0_98:
ld.global.f64 %fd182, [%rd8];
st.shared.f64 [%rd6+9216], %fd182;
- shl.b32 %r288, %r4, 4;
- add.s32 %r289, %r7, %r9;
- add.s32 %r290, %r289, %r288;
- mad.lo.s32 %r291, %r290, %r55, %r10;
- add.s32 %r292, %r291, %r11;
- mul.wide.s32 %rd88, %r292, 8;
- add.s64 %rd76, %rd4, %rd88;
-
- ld.global.cs.v4.u32 {%r240,%r241,%r242,%r243}, [%rd76];
-
- mov.b64 %rd89, {%r240, %r241};
- mov.b64 %fd183, %rd89;
- mov.b64 %rd90, {%r242, %r243};
- mov.b64 %fd184, %rd90;
- shl.b32 %r293, %r55, 3;
- mul.wide.s32 %rd91, %r293, 8;
- add.s64 %rd77, %rd76, %rd91;
-
- ld.global.cs.v4.u32 {%r244,%r245,%r246,%r247}, [%rd77];
-
- mov.b64 %rd92, {%r244, %r245};
- mov.b64 %fd185, %rd92;
- mov.b64 %rd93, {%r246, %r247};
- mov.b64 %fd186, %rd93;
- add.s64 %rd78, %rd77, %rd91;
-
- ld.global.cs.v4.u32 {%r248,%r249,%r250,%r251}, [%rd78];
-
- mov.b64 %rd94, {%r248, %r249};
- mov.b64 %fd187, %rd94;
- mov.b64 %rd95, {%r250, %r251};
- mov.b64 %fd188, %rd95;
- add.s64 %rd79, %rd78, %rd91;
-
- ld.global.cs.v4.u32 {%r252,%r253,%r254,%r255}, [%rd79];
-
- mov.b64 %rd96, {%r252, %r253};
- mov.b64 %fd189, %rd96;
- mov.b64 %rd97, {%r254, %r255};
- mov.b64 %fd190, %rd97;
- shl.b64 %rd98, %rd7, 3;
- add.s64 %rd99, %rd2, %rd98;
- ld.global.f64 %fd191, [%rd99];
+ and.b32 %r306, %r73, -16;
+ sub.s32 %r307, %r2, %r306;
+ shl.b32 %r308, %r307, 1;
+ shl.b32 %r309, %r4, 4;
+ add.s32 %r310, %r7, %r9;
+ add.s32 %r311, %r310, %r309;
+ mad.lo.s32 %r312, %r311, %r57, %r308;
+ add.s32 %r313, %r312, %r11;
+ mul.wide.s32 %rd101, %r313, 8;
+ add.s64 %rd89, %rd4, %rd101;
+
+ ld.global.cs.v4.u32 {%r255,%r256,%r257,%r258}, [%rd89];
+
+ mov.b64 %rd102, {%r255, %r256};
+ mov.b64 %fd183, %rd102;
+ mov.b64 %rd103, {%r257, %r258};
+ mov.b64 %fd184, %rd103;
+ shl.b32 %r314, %r57, 3;
+ mul.wide.s32 %rd104, %r314, 8;
+ add.s64 %rd90, %rd89, %rd104;
+
+ ld.global.cs.v4.u32 {%r259,%r260,%r261,%r262}, [%rd90];
+
+ mov.b64 %rd105, {%r259, %r260};
+ mov.b64 %fd185, %rd105;
+ mov.b64 %rd106, {%r261, %r262};
+ mov.b64 %fd186, %rd106;
+ add.s64 %rd91, %rd90, %rd104;
+
+ ld.global.cs.v4.u32 {%r263,%r264,%r265,%r266}, [%rd91];
+
+ mov.b64 %rd107, {%r263, %r264};
+ mov.b64 %fd187, %rd107;
+ mov.b64 %rd108, {%r265, %r266};
+ mov.b64 %fd188, %rd108;
+ add.s64 %rd92, %rd91, %rd104;
+
+ ld.global.cs.v4.u32 {%r267,%r268,%r269,%r270}, [%rd92];
+
+ mov.b64 %rd109, {%r267, %r268};
+ mov.b64 %fd189, %rd109;
+ mov.b64 %rd110, {%r269, %r270};
+ mov.b64 %fd190, %rd110;
+ shl.b64 %rd111, %rd7, 3;
+ add.s64 %rd112, %rd2, %rd111;
+ ld.global.f64 %fd191, [%rd112];
st.shared.f64 [%rd6+8192], %fd191;
- mad.lo.s32 %r294, %r4, 48, %r290;
- mad.lo.s32 %r295, %r294, %r55, %r10;
- add.s32 %r296, %r295, %r11;
- mul.wide.s32 %rd100, %r296, 8;
- add.s64 %rd80, %rd3, %rd100;
-
- ld.global.cs.v4.u32 {%r256,%r257,%r258,%r259}, [%rd80];
-
- mov.b64 %rd101, {%r256, %r257};
- mov.b64 %fd192, %rd101;
- mov.b64 %rd102, {%r258, %r259};
- mov.b64 %fd193, %rd102;
- add.s64 %rd81, %rd80, %rd91;
-
- ld.global.cs.v4.u32 {%r260,%r261,%r262,%r263}, [%rd81];
-
- mov.b64 %rd103, {%r260, %r261};
- mov.b64 %fd194, %rd103;
- mov.b64 %rd104, {%r262, %r263};
- mov.b64 %fd195, %rd104;
- add.s64 %rd82, %rd81, %rd91;
-
- ld.global.cs.v4.u32 {%r264,%r265,%r266,%r267}, [%rd82];
-
- mov.b64 %rd105, {%r264, %r265};
- mov.b64 %fd196, %rd105;
- mov.b64 %rd106, {%r266, %r267};
- mov.b64 %fd197, %rd106;
- add.s64 %rd83, %rd82, %rd91;
-
- ld.global.cs.v4.u32 {%r268,%r269,%r270,%r271}, [%rd83];
-
- mov.b64 %rd107, {%r268, %r269};
- mov.b64 %fd198, %rd107;
- mov.b64 %rd108, {%r270, %r271};
- mov.b64 %fd199, %rd108;
- mul.lo.s32 %r297, %r13, %r52;
- mul.wide.s32 %rd109, %r297, 8;
- add.s64 %rd110, %rd1, %rd109;
- mul.wide.s32 %rd111, %r12, 8;
- add.s64 %rd113, %rd17, %rd111;
- ld.global.f64 %fd200, [%rd110];
- st.shared.f64 [%rd113], %fd200;
+ mad.lo.s32 %r315, %r4, 48, %r311;
+ mad.lo.s32 %r316, %r315, %r57, %r308;
+ add.s32 %r317, %r316, %r11;
+ mul.wide.s32 %rd113, %r317, 8;
+ add.s64 %rd93, %rd3, %rd113;
+
+ ld.global.cs.v4.u32 {%r271,%r272,%r273,%r274}, [%rd93];
+
+ mov.b64 %rd114, {%r271, %r272};
+ mov.b64 %fd192, %rd114;
+ mov.b64 %rd115, {%r273, %r274};
+ mov.b64 %fd193, %rd115;
+ add.s64 %rd94, %rd93, %rd104;
+
+ ld.global.cs.v4.u32 {%r275,%r276,%r277,%r278}, [%rd94];
+
+ mov.b64 %rd116, {%r275, %r276};
+ mov.b64 %fd194, %rd116;
+ mov.b64 %rd117, {%r277, %r278};
+ mov.b64 %fd195, %rd117;
+ add.s64 %rd95, %rd94, %rd104;
+
+ ld.global.cs.v4.u32 {%r279,%r280,%r281,%r282}, [%rd95];
+
+ mov.b64 %rd118, {%r279, %r280};
+ mov.b64 %fd196, %rd118;
+ mov.b64 %rd119, {%r281, %r282};
+ mov.b64 %fd197, %rd119;
+ add.s64 %rd96, %rd95, %rd104;
+
+ ld.global.cs.v4.u32 {%r283,%r284,%r285,%r286}, [%rd96];
+
+ mov.b64 %rd120, {%r283, %r284};
+ mov.b64 %fd198, %rd120;
+ mov.b64 %rd121, {%r285, %r286};
+ mov.b64 %fd199, %rd121;
+ mul.lo.s32 %r318, %r12, %r54;
+ mul.wide.s32 %rd122, %r318, 8;
+ add.s64 %rd123, %rd1, %rd122;
+ ld.global.f64 %fd200, [%rd123];
+ st.shared.f64 [%rd6], %fd200;
+ st.shared.f64 [%rd6+1024], %fd200;
+ st.shared.f64 [%rd6+2048], %fd200;
+ st.shared.f64 [%rd6+3072], %fd200;
+ st.shared.f64 [%rd6+4096], %fd200;
+ st.shared.f64 [%rd6+5120], %fd200;
+ st.shared.f64 [%rd6+6144], %fd200;
+ st.shared.f64 [%rd6+7168], %fd200;
barrier.sync 0;
- mul.wide.s32 %rd114, %r9, 8;
- add.s64 %rd115, %rd17, %rd114;
- ld.shared.f64 %fd201, [%rd115];
- mul.f64 %fd202, %fd1, %fd201;
- ld.shared.f64 %fd203, [%rd115+8192];
- sub.f64 %fd204, %fd192, %fd203;
- ld.shared.f64 %fd205, [%rd115+9216];
- mul.f64 %fd206, %fd205, %fd183;
- sub.f64 %fd207, %fd204, %fd206;
- mul.f64 %fd208, %fd202, %fd207;
- mov.b64 %rd116, %fd208;
- sub.f64 %fd209, %fd193, %fd203;
- mul.f64 %fd210, %fd205, %fd184;
+ shl.b32 %r319, %r307, 6;
+ add.s32 %r320, %r319, %r9;
+ shr.s32 %r321, %r9, 31;
+ shr.u32 %r322, %r321, 25;
+ add.s32 %r323, %r9, %r322;
+ and.b32 %r324, %r323, -128;
+ sub.s32 %r325, %r9, %r324;
+ mul.wide.s32 %rd124, %r325, 8;
+ add.s64 %rd126, %rd21, 8192;
+ add.s64 %rd127, %rd126, %rd124;
+ ld.shared.f64 %fd201, [%rd127];
+ sub.f64 %fd202, %fd192, %fd201;
+ ld.shared.f64 %fd203, [%rd127+1024];
+ mul.f64 %fd204, %fd203, %fd183;
+ sub.f64 %fd205, %fd202, %fd204;
+ mul.wide.s32 %rd128, %r320, 8;
+ add.s64 %rd129, %rd21, %rd128;
+ ld.shared.f64 %fd206, [%rd129];
+ mul.f64 %fd207, %fd1, %fd206;
+ mul.f64 %fd208, %fd207, %fd205;
+ mov.b64 %rd130, %fd208;
+ sub.f64 %fd209, %fd193, %fd201;
+ mul.f64 %fd210, %fd203, %fd184;
sub.f64 %fd211, %fd209, %fd210;
- mul.f64 %fd212, %fd202, %fd211;
- mov.b64 %rd117, %fd212;
- ld.shared.f64 %fd213, [%rd115+64];
- mul.f64 %fd214, %fd1, %fd213;
- ld.shared.f64 %fd215, [%rd115+8256];
+ ld.shared.f64 %fd212, [%rd129+256];
+ mul.f64 %fd213, %fd1, %fd212;
+ mul.f64 %fd214, %fd213, %fd211;
+ mov.b64 %rd131, %fd214;
+ add.s32 %r326, %r9, 8;
+ shr.s32 %r327, %r326, 31;
+ shr.u32 %r328, %r327, 25;
+ add.s32 %r329, %r326, %r328;
+ and.b32 %r330, %r329, -128;
+ sub.s32 %r331, %r326, %r330;
+ mul.wide.s32 %rd132, %r331, 8;
+ add.s64 %rd133, %rd126, %rd132;
+ ld.shared.f64 %fd215, [%rd133];
sub.f64 %fd216, %fd194, %fd215;
- ld.shared.f64 %fd217, [%rd115+9280];
+ ld.shared.f64 %fd217, [%rd133+1024];
mul.f64 %fd218, %fd217, %fd185;
sub.f64 %fd219, %fd216, %fd218;
- mul.f64 %fd220, %fd214, %fd219;
- mov.b64 %rd118, %fd220;
- sub.f64 %fd221, %fd195, %fd215;
- mul.f64 %fd222, %fd217, %fd186;
- sub.f64 %fd223, %fd221, %fd222;
- mul.f64 %fd224, %fd214, %fd223;
- mov.b64 %rd119, %fd224;
- ld.shared.f64 %fd225, [%rd115+128];
- mul.f64 %fd226, %fd1, %fd225;
- ld.shared.f64 %fd227, [%rd115+8320];
- sub.f64 %fd228, %fd196, %fd227;
- ld.shared.f64 %fd229, [%rd115+9344];
- mul.f64 %fd230, %fd229, %fd187;
- sub.f64 %fd231, %fd228, %fd230;
- mul.f64 %fd232, %fd226, %fd231;
- mov.b64 %rd120, %fd232;
- sub.f64 %fd233, %fd197, %fd227;
- mul.f64 %fd234, %fd229, %fd188;
- sub.f64 %fd235, %fd233, %fd234;
- mul.f64 %fd236, %fd226, %fd235;
- mov.b64 %rd121, %fd236;
- ld.shared.f64 %fd237, [%rd115+192];
- mul.f64 %fd238, %fd1, %fd237;
- ld.shared.f64 %fd239, [%rd115+8384];
- sub.f64 %fd240, %fd198, %fd239;
- ld.shared.f64 %fd241, [%rd115+9408];
- mul.f64 %fd242, %fd241, %fd189;
- sub.f64 %fd243, %fd240, %fd242;
- mul.f64 %fd244, %fd238, %fd243;
- mov.b64 %rd122, %fd244;
- sub.f64 %fd245, %fd199, %fd239;
- mul.f64 %fd246, %fd241, %fd190;
- sub.f64 %fd247, %fd245, %fd246;
- mul.f64 %fd248, %fd238, %fd247;
- mov.b64 %rd123, %fd248;
- shl.b32 %r298, %r4, 9;
- add.s32 %r299, %r289, %r298;
- mad.lo.s32 %r300, %r299, %r55, %r10;
- add.s32 %r301, %r300, %r11;
- mul.wide.s32 %rd124, %r301, 8;
- add.s64 %rd84, %rd13, %rd124;
- mov.b64 {%r272, %r273}, %rd116;
- mov.b64 {%r274, %r275}, %rd117;
-
- st.global.cs.v4.s32 [%rd84], {%r272,%r273,%r274,%r275};
-
- mov.b64 {%r276, %r277}, %rd118;
- mov.b64 {%r278, %r279}, %rd119;
- add.s64 %rd85, %rd84, %rd91;
-
- st.global.cs.v4.s32 [%rd85], {%r276,%r277,%r278,%r279};
-
- mov.b64 {%r280, %r281}, %rd120;
- mov.b64 {%r282, %r283}, %rd121;
- add.s64 %rd86, %rd85, %rd91;
-
- st.global.cs.v4.s32 [%rd86], {%r280,%r281,%r282,%r283};
-
- mov.b64 {%r284, %r285}, %rd122;
- mov.b64 {%r286, %r287}, %rd123;
- add.s64 %rd87, %rd86, %rd91;
-
- st.global.cs.v4.s32 [%rd87], {%r284,%r285,%r286,%r287};
+ ld.shared.f64 %fd220, [%rd129+64];
+ mul.f64 %fd221, %fd1, %fd220;
+ mul.f64 %fd222, %fd221, %fd219;
+ mov.b64 %rd134, %fd222;
+ sub.f64 %fd223, %fd195, %fd215;
+ mul.f64 %fd224, %fd217, %fd186;
+ sub.f64 %fd225, %fd223, %fd224;
+ ld.shared.f64 %fd226, [%rd129+320];
+ mul.f64 %fd227, %fd1, %fd226;
+ mul.f64 %fd228, %fd227, %fd225;
+ mov.b64 %rd135, %fd228;
+ add.s32 %r332, %r9, 16;
+ shr.s32 %r333, %r332, 31;
+ shr.u32 %r334, %r333, 25;
+ add.s32 %r335, %r332, %r334;
+ and.b32 %r336, %r335, -128;
+ sub.s32 %r337, %r332, %r336;
+ mul.wide.s32 %rd136, %r337, 8;
+ add.s64 %rd137, %rd126, %rd136;
+ ld.shared.f64 %fd229, [%rd137];
+ sub.f64 %fd230, %fd196, %fd229;
+ ld.shared.f64 %fd231, [%rd137+1024];
+ mul.f64 %fd232, %fd231, %fd187;
+ sub.f64 %fd233, %fd230, %fd232;
+ ld.shared.f64 %fd234, [%rd129+128];
+ mul.f64 %fd235, %fd1, %fd234;
+ mul.f64 %fd236, %fd235, %fd233;
+ mov.b64 %rd138, %fd236;
+ sub.f64 %fd237, %fd197, %fd229;
+ mul.f64 %fd238, %fd231, %fd188;
+ sub.f64 %fd239, %fd237, %fd238;
+ ld.shared.f64 %fd240, [%rd129+384];
+ mul.f64 %fd241, %fd1, %fd240;
+ mul.f64 %fd242, %fd241, %fd239;
+ mov.b64 %rd139, %fd242;
+ shr.s32 %r338, %r10, 31;
+ shr.u32 %r339, %r338, 25;
+ add.s32 %r340, %r10, %r339;
+ and.b32 %r341, %r340, -128;
+ sub.s32 %r342, %r10, %r341;
+ mul.wide.s32 %rd140, %r342, 8;
+ add.s64 %rd141, %rd126, %rd140;
+ ld.shared.f64 %fd243, [%rd141];
+ sub.f64 %fd244, %fd198, %fd243;
+ ld.shared.f64 %fd245, [%rd141+1024];
+ mul.f64 %fd246, %fd245, %fd189;
+ sub.f64 %fd247, %fd244, %fd246;
+ ld.shared.f64 %fd248, [%rd129+192];
+ mul.f64 %fd249, %fd1, %fd248;
+ mul.f64 %fd250, %fd249, %fd247;
+ mov.b64 %rd142, %fd250;
+ sub.f64 %fd251, %fd199, %fd243;
+ mul.f64 %fd252, %fd245, %fd190;
+ sub.f64 %fd253, %fd251, %fd252;
+ ld.shared.f64 %fd254, [%rd129+448];
+ mul.f64 %fd255, %fd1, %fd254;
+ mul.f64 %fd256, %fd255, %fd253;
+ mov.b64 %rd143, %fd256;
+ shl.b32 %r343, %r4, 9;
+ add.s32 %r344, %r310, %r343;
+ mad.lo.s32 %r345, %r344, %r57, %r308;
+ add.s32 %r346, %r345, %r11;
+ mul.wide.s32 %rd144, %r346, 8;
+ add.s64 %rd97, %rd17, %rd144;
+ mov.b64 {%r287, %r288}, %rd130;
+ mov.b64 {%r289, %r290}, %rd131;
+
+ st.global.cs.v4.s32 [%rd97], {%r287,%r288,%r289,%r290};
+
+ mov.b64 {%r291, %r292}, %rd134;
+ mov.b64 {%r293, %r294}, %rd135;
+ add.s64 %rd98, %rd97, %rd104;
+
+ st.global.cs.v4.s32 [%rd98], {%r291,%r292,%r293,%r294};
+
+ mov.b64 {%r295, %r296}, %rd138;
+ mov.b64 {%r297, %r298}, %rd139;
+ add.s64 %rd99, %rd98, %rd104;
+
+ st.global.cs.v4.s32 [%rd99], {%r295,%r296,%r297,%r298};
+
+ mov.b64 {%r299, %r300}, %rd142;
+ mov.b64 {%r301, %r302}, %rd143;
+ add.s64 %rd100, %rd99, %rd104;
+
+ st.global.cs.v4.s32 [%rd100], {%r299,%r300,%r301,%r302};
bra.uni $L__BB0_99;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f64 %fd107, [%rd8];
st.shared.f64 [%rd6+9216], %fd107;
$L__BB0_9:
- mov.u32 %r14, %ctaid.x;
- add.s32 %r86, %r55, 31;
- shr.s32 %r87, %r86, 31;
- shr.u32 %r88, %r87, 27;
- add.s32 %r89, %r86, %r88;
- shr.s32 %r15, %r89, 5;
- shl.b32 %r16, %r4, 4;
- shr.s32 %r90, %r2, 31;
- shr.u32 %r91, %r90, 28;
- add.s32 %r92, %r2, %r91;
- and.b32 %r93, %r92, 2147483632;
- sub.s32 %r94, %r2, %r93;
- shl.b32 %r95, %r94, 1;
- rem.s32 %r96, %r14, %r15;
- shl.b32 %r97, %r96, 5;
- add.s32 %r20, %r97, %r95;
+ mov.u32 %r13, %ctaid.x;
+ add.s32 %r82, %r57, 31;
+ shr.s32 %r83, %r82, 31;
+ shr.u32 %r84, %r83, 27;
+ add.s32 %r85, %r82, %r84;
+ shr.s32 %r14, %r85, 5;
+ shl.b32 %r15, %r4, 4;
+ shr.s32 %r86, %r2, 31;
+ shr.u32 %r87, %r86, 28;
+ add.s32 %r88, %r2, %r87;
+ and.b32 %r89, %r88, -16;
+ sub.s32 %r16, %r2, %r89;
+ shl.b32 %r90, %r16, 1;
+ rem.s32 %r91, %r13, %r14;
+ shl.b32 %r92, %r91, 5;
+ add.s32 %r20, %r92, %r90;
or.b32 %r17, %r20, 1;
- setp.ge.s32 %p8, %r17, %r55;
- shr.s32 %r18, %r92, 4;
+ setp.ge.s32 %p8, %r17, %r57;
+ shr.s32 %r18, %r88, 4;
add.s32 %r19, %r18, -216;
- mov.f64 %fd265, 0d0000000000000000;
- mov.f64 %fd267, 0d0000000000000000;
- mov.f64 %fd266, %fd267;
+ mov.f64 %fd273, 0d0000000000000000;
+ mov.f64 %fd275, 0d0000000000000000;
+ mov.f64 %fd274, %fd275;
@%p8 bra $L__BB0_12;
- div.s32 %r98, %r14, %r15;
- shl.b32 %r21, %r98, 5;
- add.s32 %r99, %r19, %r21;
- neg.s32 %r100, %r16;
- setp.ge.s32 %p9, %r99, %r100;
+ div.s32 %r93, %r13, %r14;
+ shl.b32 %r21, %r93, 5;
+ add.s32 %r94, %r19, %r21;
+ neg.s32 %r95, %r15;
+ setp.ge.s32 %p9, %r94, %r95;
@%p9 bra $L__BB0_12;
- add.s32 %r105, %r16, %r18;
- add.s32 %r106, %r105, %r21;
- mad.lo.s32 %r107, %r106, %r55, %r20;
- mul.wide.s32 %rd21, %r107, 8;
- add.s64 %rd20, %rd4, %rd21;
-
- ld.global.cs.v4.u32 {%r101,%r102,%r103,%r104}, [%rd20];
-
- mov.b64 %rd22, {%r101, %r102};
- mov.b64 %fd266, %rd22;
- mov.b64 %rd23, {%r103, %r104};
- mov.b64 %fd265, %rd23;
+ add.s32 %r100, %r15, %r18;
+ add.s32 %r101, %r100, %r21;
+ mad.lo.s32 %r102, %r101, %r57, %r20;
+ mul.wide.s32 %rd25, %r102, 8;
+ add.s64 %rd24, %rd4, %rd25;
+
+ ld.global.cs.v4.u32 {%r96,%r97,%r98,%r99}, [%rd24];
+
+ mov.b64 %rd26, {%r96, %r97};
+ mov.b64 %fd274, %rd26;
+ mov.b64 %rd27, {%r98, %r99};
+ mov.b64 %fd273, %rd27;
$L__BB0_12:
- mov.f64 %fd268, %fd267;
+ mov.f64 %fd276, %fd275;
@%p8 bra $L__BB0_15;
- div.s32 %r108, %r14, %r15;
- shl.b32 %r22, %r108, 5;
- add.s32 %r109, %r19, %r22;
- mov.u32 %r110, -8;
- sub.s32 %r111, %r110, %r16;
- setp.ge.s32 %p11, %r109, %r111;
- mov.f64 %fd268, %fd267;
+ div.s32 %r103, %r13, %r14;
+ shl.b32 %r22, %r103, 5;
+ add.s32 %r104, %r19, %r22;
+ mov.u32 %r105, -8;
+ sub.s32 %r106, %r105, %r15;
+ setp.ge.s32 %p11, %r104, %r106;
+ mov.f64 %fd276, %fd275;
@%p11 bra $L__BB0_15;
- add.s32 %r116, %r16, %r18;
- add.s32 %r117, %r116, %r22;
- add.s32 %r118, %r117, 8;
- mad.lo.s32 %r119, %r118, %r55, %r20;
- mul.wide.s32 %rd25, %r119, 8;
- add.s64 %rd24, %rd4, %rd25;
-
- ld.global.cs.v4.u32 {%r112,%r113,%r114,%r115}, [%rd24];
-
- mov.b64 %rd26, {%r112, %r113};
- mov.b64 %fd268, %rd26;
- mov.b64 %rd27, {%r114, %r115};
- mov.b64 %fd267, %rd27;
+ add.s32 %r111, %r15, %r18;
+ add.s32 %r112, %r111, %r22;
+ add.s32 %r113, %r112, 8;
+ mad.lo.s32 %r114, %r113, %r57, %r20;
+ mul.wide.s32 %rd29, %r114, 8;
+ add.s64 %rd28, %rd4, %rd29;
+
+ ld.global.cs.v4.u32 {%r107,%r108,%r109,%r110}, [%rd28];
+
+ mov.b64 %rd30, {%r107, %r108};
+ mov.b64 %fd276, %rd30;
+ mov.b64 %rd31, {%r109, %r110};
+ mov.b64 %fd275, %rd31;
$L__BB0_15:
- mov.f64 %fd269, 0d0000000000000000;
- mov.f64 %fd271, 0d0000000000000000;
- mov.f64 %fd270, %fd271;
+ mov.f64 %fd277, 0d0000000000000000;
+ mov.f64 %fd279, 0d0000000000000000;
+ mov.f64 %fd278, %fd279;
@%p8 bra $L__BB0_18;
- div.s32 %r120, %r14, %r15;
- shl.b32 %r23, %r120, 5;
- add.s32 %r121, %r19, %r23;
- mov.u32 %r122, -16;
- sub.s32 %r123, %r122, %r16;
- setp.ge.s32 %p13, %r121, %r123;
+ div.s32 %r115, %r13, %r14;
+ shl.b32 %r23, %r115, 5;
+ add.s32 %r116, %r19, %r23;
+ mov.u32 %r117, -16;
+ sub.s32 %r118, %r117, %r15;
+ setp.ge.s32 %p13, %r116, %r118;
@%p13 bra $L__BB0_18;
- add.s32 %r128, %r16, %r18;
- add.s32 %r129, %r128, %r23;
- add.s32 %r130, %r129, 16;
- mad.lo.s32 %r131, %r130, %r55, %r20;
- mul.wide.s32 %rd29, %r131, 8;
- add.s64 %rd28, %rd4, %rd29;
-
- ld.global.cs.v4.u32 {%r124,%r125,%r126,%r127}, [%rd28];
-
- mov.b64 %rd30, {%r124, %r125};
- mov.b64 %fd270, %rd30;
- mov.b64 %rd31, {%r126, %r127};
- mov.b64 %fd269, %rd31;
+ add.s32 %r123, %r15, %r18;
+ add.s32 %r124, %r123, %r23;
+ add.s32 %r125, %r124, 16;
+ mad.lo.s32 %r126, %r125, %r57, %r20;
+ mul.wide.s32 %rd33, %r126, 8;
+ add.s64 %rd32, %rd4, %rd33;
+
+ ld.global.cs.v4.u32 {%r119,%r120,%r121,%r122}, [%rd32];
+
+ mov.b64 %rd34, {%r119, %r120};
+ mov.b64 %fd278, %rd34;
+ mov.b64 %rd35, {%r121, %r122};
+ mov.b64 %fd277, %rd35;
$L__BB0_18:
- mov.f64 %fd272, %fd271;
+ mov.f64 %fd280, %fd279;
@%p8 bra $L__BB0_21;
- div.s32 %r132, %r14, %r15;
- shl.b32 %r24, %r132, 5;
- add.s32 %r133, %r19, %r24;
- mov.u32 %r134, -24;
- sub.s32 %r135, %r134, %r16;
- setp.ge.s32 %p15, %r133, %r135;
- mov.f64 %fd272, %fd271;
+ div.s32 %r127, %r13, %r14;
+ shl.b32 %r24, %r127, 5;
+ add.s32 %r128, %r19, %r24;
+ mov.u32 %r129, -24;
+ sub.s32 %r130, %r129, %r15;
+ setp.ge.s32 %p15, %r128, %r130;
+ mov.f64 %fd280, %fd279;
@%p15 bra $L__BB0_21;
- add.s32 %r140, %r16, %r18;
- add.s32 %r141, %r140, %r24;
- add.s32 %r142, %r141, 24;
- mad.lo.s32 %r143, %r142, %r55, %r20;
- mul.wide.s32 %rd33, %r143, 8;
- add.s64 %rd32, %rd4, %rd33;
-
- ld.global.cs.v4.u32 {%r136,%r137,%r138,%r139}, [%rd32];
-
- mov.b64 %rd34, {%r136, %r137};
- mov.b64 %fd272, %rd34;
- mov.b64 %rd35, {%r138, %r139};
- mov.b64 %fd271, %rd35;
+ add.s32 %r135, %r15, %r18;
+ add.s32 %r136, %r135, %r24;
+ add.s32 %r137, %r136, 24;
+ mad.lo.s32 %r138, %r137, %r57, %r20;
+ mul.wide.s32 %rd37, %r138, 8;
+ add.s64 %rd36, %rd4, %rd37;
+
+ ld.global.cs.v4.u32 {%r131,%r132,%r133,%r134}, [%rd36];
+
+ mov.b64 %rd38, {%r131, %r132};
+ mov.b64 %fd280, %rd38;
+ mov.b64 %rd39, {%r133, %r134};
+ mov.b64 %fd279, %rd39;
$L__BB0_21:
shl.b32 %r25, %r4, 6;
setp.gt.s32 %p16, %r2, 31;
@%p16 bra $L__BB0_24;
- div.s32 %r144, %r14, %r15;
- shl.b32 %r145, %r144, 5;
- add.s32 %r26, %r145, %r2;
+ div.s32 %r139, %r13, %r14;
+ shl.b32 %r140, %r139, 5;
+ add.s32 %r26, %r140, %r2;
setp.gt.s32 %p17, %r26, 215;
@%p17 bra $L__BB0_24;
- mul.wide.s32 %rd36, %r26, 8;
- add.s64 %rd37, %rd2, %rd36;
- ld.global.f64 %fd124, [%rd37];
+ mul.wide.s32 %rd40, %r26, 8;
+ add.s64 %rd41, %rd2, %rd40;
+ ld.global.f64 %fd124, [%rd41];
st.shared.f64 [%rd6+8192], %fd124;
$L__BB0_24:
- mov.f64 %fd273, 0d0000000000000000;
- mov.f64 %fd275, 0d0000000000000000;
- mov.f64 %fd274, %fd275;
+ mov.f64 %fd281, 0d0000000000000000;
+ mov.f64 %fd283, 0d0000000000000000;
+ mov.f64 %fd282, %fd283;
@%p8 bra $L__BB0_27;
- div.s32 %r146, %r14, %r15;
- shl.b32 %r27, %r146, 5;
- add.s32 %r147, %r19, %r27;
- neg.s32 %r148, %r25;
- setp.ge.s32 %p19, %r147, %r148;
+ div.s32 %r141, %r13, %r14;
+ shl.b32 %r27, %r141, 5;
+ add.s32 %r142, %r19, %r27;
+ neg.s32 %r143, %r25;
+ setp.ge.s32 %p19, %r142, %r143;
@%p19 bra $L__BB0_27;
- add.s32 %r153, %r25, %r18;
- add.s32 %r154, %r153, %r27;
- mad.lo.s32 %r155, %r154, %r55, %r20;
- mul.wide.s32 %rd39, %r155, 8;
- add.s64 %rd38, %rd3, %rd39;
-
- ld.global.cs.v4.u32 {%r149,%r150,%r151,%r152}, [%rd38];
-
- mov.b64 %rd40, {%r149, %r150};
- mov.b64 %fd274, %rd40;
- mov.b64 %rd41, {%r151, %r152};
- mov.b64 %fd273, %rd41;
+ add.s32 %r148, %r25, %r18;
+ add.s32 %r149, %r148, %r27;
+ mad.lo.s32 %r150, %r149, %r57, %r20;
+ mul.wide.s32 %rd43, %r150, 8;
+ add.s64 %rd42, %rd3, %rd43;
+
+ ld.global.cs.v4.u32 {%r144,%r145,%r146,%r147}, [%rd42];
+
+ mov.b64 %rd44, {%r144, %r145};
+ mov.b64 %fd282, %rd44;
+ mov.b64 %rd45, {%r146, %r147};
+ mov.b64 %fd281, %rd45;
$L__BB0_27:
- mov.f64 %fd276, %fd275;
+ mov.f64 %fd284, %fd283;
@%p8 bra $L__BB0_30;
- div.s32 %r156, %r14, %r15;
- shl.b32 %r28, %r156, 5;
- add.s32 %r157, %r19, %r28;
- mov.u32 %r158, -8;
- sub.s32 %r159, %r158, %r25;
- setp.ge.s32 %p21, %r157, %r159;
- mov.f64 %fd276, %fd275;
+ div.s32 %r151, %r13, %r14;
+ shl.b32 %r28, %r151, 5;
+ add.s32 %r152, %r19, %r28;
+ mov.u32 %r153, -8;
+ sub.s32 %r154, %r153, %r25;
+ setp.ge.s32 %p21, %r152, %r154;
+ mov.f64 %fd284, %fd283;
@%p21 bra $L__BB0_30;
- add.s32 %r164, %r25, %r18;
- add.s32 %r165, %r164, %r28;
- add.s32 %r166, %r165, 8;
- mad.lo.s32 %r167, %r166, %r55, %r20;
- mul.wide.s32 %rd43, %r167, 8;
- add.s64 %rd42, %rd3, %rd43;
-
- ld.global.cs.v4.u32 {%r160,%r161,%r162,%r163}, [%rd42];
-
- mov.b64 %rd44, {%r160, %r161};
- mov.b64 %fd276, %rd44;
- mov.b64 %rd45, {%r162, %r163};
- mov.b64 %fd275, %rd45;
+ add.s32 %r159, %r25, %r18;
+ add.s32 %r160, %r159, %r28;
+ add.s32 %r161, %r160, 8;
+ mad.lo.s32 %r162, %r161, %r57, %r20;
+ mul.wide.s32 %rd47, %r162, 8;
+ add.s64 %rd46, %rd3, %rd47;
+
+ ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd46];
+
+ mov.b64 %rd48, {%r155, %r156};
+ mov.b64 %fd284, %rd48;
+ mov.b64 %rd49, {%r157, %r158};
+ mov.b64 %fd283, %rd49;
$L__BB0_30:
- mov.f64 %fd277, 0d0000000000000000;
- mov.f64 %fd279, 0d0000000000000000;
- mov.f64 %fd278, %fd279;
+ mov.f64 %fd285, 0d0000000000000000;
+ mov.f64 %fd287, 0d0000000000000000;
+ mov.f64 %fd286, %fd287;
@%p8 bra $L__BB0_33;
- div.s32 %r168, %r14, %r15;
- shl.b32 %r29, %r168, 5;
- add.s32 %r169, %r19, %r29;
- mov.u32 %r170, -16;
- sub.s32 %r171, %r170, %r25;
- setp.ge.s32 %p23, %r169, %r171;
+ div.s32 %r163, %r13, %r14;
+ shl.b32 %r29, %r163, 5;
+ add.s32 %r164, %r19, %r29;
+ mov.u32 %r165, -16;
+ sub.s32 %r166, %r165, %r25;
+ setp.ge.s32 %p23, %r164, %r166;
@%p23 bra $L__BB0_33;
- add.s32 %r176, %r25, %r18;
- add.s32 %r177, %r176, %r29;
- add.s32 %r178, %r177, 16;
- mad.lo.s32 %r179, %r178, %r55, %r20;
- mul.wide.s32 %rd47, %r179, 8;
- add.s64 %rd46, %rd3, %rd47;
-
- ld.global.cs.v4.u32 {%r172,%r173,%r174,%r175}, [%rd46];
-
- mov.b64 %rd48, {%r172, %r173};
- mov.b64 %fd278, %rd48;
- mov.b64 %rd49, {%r174, %r175};
- mov.b64 %fd277, %rd49;
+ add.s32 %r171, %r25, %r18;
+ add.s32 %r172, %r171, %r29;
+ add.s32 %r173, %r172, 16;
+ mad.lo.s32 %r174, %r173, %r57, %r20;
+ mul.wide.s32 %rd51, %r174, 8;
+ add.s64 %rd50, %rd3, %rd51;
+
+ ld.global.cs.v4.u32 {%r167,%r168,%r169,%r170}, [%rd50];
+
+ mov.b64 %rd52, {%r167, %r168};
+ mov.b64 %fd286, %rd52;
+ mov.b64 %rd53, {%r169, %r170};
+ mov.b64 %fd285, %rd53;
$L__BB0_33:
- setp.lt.s32 %p24, %r17, %r55;
+ setp.lt.s32 %p24, %r17, %r57;
@%p24 bra $L__BB0_35;
bra.uni $L__BB0_34;
$L__BB0_35:
- div.s32 %r180, %r14, %r15;
- shl.b32 %r30, %r180, 5;
- add.s32 %r181, %r19, %r30;
- mov.u32 %r182, -24;
- sub.s32 %r183, %r182, %r25;
- setp.ge.s32 %p25, %r181, %r183;
- mov.f64 %fd280, %fd279;
+ div.s32 %r175, %r13, %r14;
+ shl.b32 %r30, %r175, 5;
+ add.s32 %r176, %r19, %r30;
+ mov.u32 %r177, -24;
+ sub.s32 %r178, %r177, %r25;
+ setp.ge.s32 %p25, %r176, %r178;
+ mov.f64 %fd288, %fd287;
@%p25 bra $L__BB0_37;
- add.s32 %r188, %r25, %r18;
- add.s32 %r189, %r188, %r30;
- add.s32 %r190, %r189, 24;
- mad.lo.s32 %r191, %r190, %r55, %r20;
- mul.wide.s32 %rd51, %r191, 8;
- add.s64 %rd50, %rd3, %rd51;
-
- ld.global.cs.v4.u32 {%r184,%r185,%r186,%r187}, [%rd50];
-
- mov.b64 %rd52, {%r184, %r185};
- mov.b64 %fd280, %rd52;
- mov.b64 %rd53, {%r186, %r187};
- mov.b64 %fd279, %rd53;
+ add.s32 %r183, %r25, %r18;
+ add.s32 %r184, %r183, %r30;
+ add.s32 %r185, %r184, 24;
+ mad.lo.s32 %r186, %r185, %r57, %r20;
+ mul.wide.s32 %rd55, %r186, 8;
+ add.s64 %rd54, %rd3, %rd55;
+
+ ld.global.cs.v4.u32 {%r179,%r180,%r181,%r182}, [%rd54];
+
+ mov.b64 %rd56, {%r179, %r180};
+ mov.b64 %fd288, %rd56;
+ mov.b64 %rd57, {%r181, %r182};
+ mov.b64 %fd287, %rd57;
bra.uni $L__BB0_37;
$L__BB0_34:
- mov.f64 %fd280, %fd279;
+ mov.f64 %fd288, %fd287;
$L__BB0_37:
- div.s32 %r192, %r14, %r15;
- shl.b32 %r31, %r192, 5;
- shr.u32 %r194, %r90, 27;
- add.s32 %r195, %r2, %r194;
- and.b32 %r196, %r195, -32;
- sub.s32 %r32, %r2, %r196;
- add.s32 %r197, %r31, %r32;
- setp.gt.s32 %p26, %r197, 215;
- mul.lo.s32 %r198, %r197, %r52;
- mul.wide.s32 %rd54, %r198, 8;
- add.s64 %rd9, %rd1, %rd54;
+ div.s32 %r187, %r13, %r14;
+ shl.b32 %r31, %r187, 5;
+ shr.u32 %r189, %r86, 27;
+ add.s32 %r190, %r2, %r189;
+ and.b32 %r191, %r190, -32;
+ sub.s32 %r192, %r2, %r191;
+ add.s32 %r193, %r31, %r192;
+ setp.gt.s32 %p26, %r193, 215;
+ mul.lo.s32 %r194, %r193, %r54;
+ mul.wide.s32 %rd58, %r194, 8;
+ add.s64 %rd9, %rd1, %rd58;
@%p26 bra $L__BB0_39;
- mul.wide.s32 %rd55, %r32, 8;
- add.s64 %rd57, %rd17, %rd55;
ld.global.f64 %fd141, [%rd9];
- st.shared.f64 [%rd57], %fd141;
+ st.shared.f64 [%rd6], %fd141;
+ st.shared.f64 [%rd6+1024], %fd141;
+ st.shared.f64 [%rd6+2048], %fd141;
+ st.shared.f64 [%rd6+3072], %fd141;
+ st.shared.f64 [%rd6+4096], %fd141;
+ st.shared.f64 [%rd6+5120], %fd141;
+ st.shared.f64 [%rd6+6144], %fd141;
+ st.shared.f64 [%rd6+7168], %fd141;
$L__BB0_39:
- shl.b32 %r33, %r4, 8;
+ shl.b32 %r32, %r4, 8;
barrier.sync 0;
- neg.s32 %r34, %r33;
- add.s32 %r35, %r19, %r31;
- setp.ge.s32 %p27, %r35, %r34;
- mul.wide.s32 %rd58, %r18, 8;
- add.s64 %rd10, %rd17, %rd58;
- mov.f64 %fd282, 0d0000000000000000;
- mov.f64 %fd281, %fd282;
+ neg.s32 %r33, %r32;
+ add.s32 %r34, %r19, %r31;
+ setp.ge.s32 %p27, %r34, %r33;
+ shr.s32 %r195, %r18, 31;
+ shr.u32 %r196, %r195, 25;
+ add.s32 %r197, %r18, %r196;
+ and.b32 %r198, %r197, -128;
+ sub.s32 %r199, %r18, %r198;
+ mul.wide.s32 %rd59, %r199, 8;
+ add.s64 %rd61, %rd21, %rd59;
+ add.s64 %rd10, %rd61, 8192;
+ mov.f64 %fd290, 0d0000000000000000;
+ mov.f64 %fd289, %fd290;
@%p27 bra $L__BB0_41;
- ld.shared.f64 %fd281, [%rd10+8192];
+ ld.shared.f64 %fd289, [%rd10];
$L__BB0_41:
- sub.f64 %fd36, %fd274, %fd281;
+ sub.f64 %fd36, %fd282, %fd289;
@%p27 bra $L__BB0_43;
- ld.shared.f64 %fd282, [%rd10+9216];
+ ld.shared.f64 %fd290, [%rd10+1024];
$L__BB0_43:
- mul.f64 %fd145, %fd282, %fd266;
+ mul.f64 %fd145, %fd290, %fd274;
sub.f64 %fd39, %fd36, %fd145;
- mov.f64 %fd284, 0d0000000000000000;
- mov.f64 %fd283, %fd284;
+ shl.b32 %r200, %r16, 6;
+ add.s32 %r201, %r200, %r18;
+ mul.wide.s32 %rd62, %r201, 8;
+ add.s64 %rd11, %rd21, %rd62;
+ mov.f64 %fd292, 0d0000000000000000;
+ mov.f64 %fd291, %fd292;
@%p27 bra $L__BB0_45;
- ld.shared.f64 %fd146, [%rd10];
- mul.f64 %fd283, %fd1, %fd146;
+ ld.shared.f64 %fd146, [%rd11];
+ mul.f64 %fd291, %fd1, %fd146;
$L__BB0_45:
- mul.f64 %fd42, %fd39, %fd283;
+ mul.f64 %fd42, %fd39, %fd291;
@%p27 bra $L__BB0_47;
- ld.shared.f64 %fd284, [%rd10+8192];
+ ld.shared.f64 %fd292, [%rd10];
$L__BB0_47:
- sub.f64 %fd45, %fd273, %fd284;
- mov.f64 %fd286, 0d0000000000000000;
- mov.f64 %fd285, %fd286;
+ sub.f64 %fd45, %fd281, %fd292;
+ mov.f64 %fd294, 0d0000000000000000;
+ mov.f64 %fd293, %fd294;
@%p27 bra $L__BB0_49;
- ld.shared.f64 %fd285, [%rd10+9216];
+ ld.shared.f64 %fd293, [%rd10+1024];
$L__BB0_49:
- mul.f64 %fd150, %fd285, %fd265;
+ mul.f64 %fd150, %fd293, %fd273;
sub.f64 %fd48, %fd45, %fd150;
@%p27 bra $L__BB0_51;
- ld.shared.f64 %fd151, [%rd10];
- mul.f64 %fd286, %fd1, %fd151;
+ ld.shared.f64 %fd151, [%rd11+256];
+ mul.f64 %fd294, %fd1, %fd151;
$L__BB0_51:
- mul.f64 %fd51, %fd48, %fd286;
- mov.u32 %r199, -8;
- sub.s32 %r36, %r199, %r33;
- setp.ge.s32 %p33, %r35, %r36;
- mov.f64 %fd288, 0d0000000000000000;
- mov.f64 %fd287, %fd288;
+ mul.f64 %fd51, %fd48, %fd294;
+ mov.u32 %r202, -8;
+ sub.s32 %r35, %r202, %r32;
+ setp.ge.s32 %p33, %r34, %r35;
+ add.s32 %r36, %r18, 8;
+ shr.s32 %r203, %r36, 31;
+ shr.u32 %r204, %r203, 25;
+ add.s32 %r205, %r36, %r204;
+ and.b32 %r206, %r205, -128;
+ sub.s32 %r207, %r36, %r206;
+ mul.wide.s32 %rd64, %r207, 8;
+ add.s64 %rd66, %rd21, %rd64;
+ add.s64 %rd12, %rd66, 8192;
+ mov.f64 %fd296, 0d0000000000000000;
+ mov.f64 %fd295, %fd296;
@%p33 bra $L__BB0_53;
- ld.shared.f64 %fd287, [%rd10+8256];
+ ld.shared.f64 %fd295, [%rd12];
$L__BB0_53:
- sub.f64 %fd54, %fd276, %fd287;
+ sub.f64 %fd54, %fd284, %fd295;
@%p33 bra $L__BB0_55;
- ld.shared.f64 %fd288, [%rd10+9280];
+ ld.shared.f64 %fd296, [%rd12+1024];
$L__BB0_55:
- mul.f64 %fd155, %fd288, %fd268;
+ mul.f64 %fd155, %fd296, %fd276;
sub.f64 %fd57, %fd54, %fd155;
- mov.f64 %fd290, 0d0000000000000000;
- mov.f64 %fd289, %fd290;
+ mov.f64 %fd298, 0d0000000000000000;
+ mov.f64 %fd297, %fd298;
@%p33 bra $L__BB0_57;
- ld.shared.f64 %fd156, [%rd10+64];
- mul.f64 %fd289, %fd1, %fd156;
+ ld.shared.f64 %fd156, [%rd11+64];
+ mul.f64 %fd297, %fd1, %fd156;
$L__BB0_57:
- mul.f64 %fd60, %fd57, %fd289;
+ mul.f64 %fd60, %fd57, %fd297;
@%p33 bra $L__BB0_59;
- ld.shared.f64 %fd290, [%rd10+8256];
+ ld.shared.f64 %fd298, [%rd12];
$L__BB0_59:
- sub.f64 %fd63, %fd275, %fd290;
- mov.f64 %fd292, 0d0000000000000000;
- mov.f64 %fd291, %fd292;
+ sub.f64 %fd63, %fd283, %fd298;
+ mov.f64 %fd300, 0d0000000000000000;
+ mov.f64 %fd299, %fd300;
@%p33 bra $L__BB0_61;
- ld.shared.f64 %fd291, [%rd10+9280];
+ ld.shared.f64 %fd299, [%rd12+1024];
$L__BB0_61:
- mul.f64 %fd160, %fd291, %fd267;
+ mul.f64 %fd160, %fd299, %fd275;
sub.f64 %fd66, %fd63, %fd160;
@%p33 bra $L__BB0_63;
- ld.shared.f64 %fd161, [%rd10+64];
- mul.f64 %fd292, %fd1, %fd161;
+ ld.shared.f64 %fd161, [%rd11+320];
+ mul.f64 %fd300, %fd1, %fd161;
$L__BB0_63:
- mul.f64 %fd69, %fd66, %fd292;
- mov.u32 %r200, -16;
- sub.s32 %r37, %r200, %r33;
- setp.ge.s32 %p39, %r35, %r37;
- mov.f64 %fd294, 0d0000000000000000;
- mov.f64 %fd293, %fd294;
+ mul.f64 %fd69, %fd66, %fd300;
+ mov.u32 %r208, -16;
+ sub.s32 %r37, %r208, %r32;
+ setp.ge.s32 %p39, %r34, %r37;
+ add.s32 %r38, %r18, 16;
+ shr.s32 %r209, %r38, 31;
+ shr.u32 %r210, %r209, 25;
+ add.s32 %r211, %r38, %r210;
+ and.b32 %r212, %r211, -128;
+ sub.s32 %r213, %r38, %r212;
+ mul.wide.s32 %rd67, %r213, 8;
+ add.s64 %rd69, %rd21, %rd67;
+ add.s64 %rd13, %rd69, 8192;
+ mov.f64 %fd302, 0d0000000000000000;
+ mov.f64 %fd301, %fd302;
@%p39 bra $L__BB0_65;
- ld.shared.f64 %fd293, [%rd10+8320];
+ ld.shared.f64 %fd301, [%rd13];
$L__BB0_65:
- sub.f64 %fd72, %fd278, %fd293;
+ sub.f64 %fd72, %fd286, %fd301;
@%p39 bra $L__BB0_67;
- ld.shared.f64 %fd294, [%rd10+9344];
+ ld.shared.f64 %fd302, [%rd13+1024];
$L__BB0_67:
- mul.f64 %fd165, %fd294, %fd270;
+ mul.f64 %fd165, %fd302, %fd278;
sub.f64 %fd75, %fd72, %fd165;
- mov.f64 %fd296, 0d0000000000000000;
- mov.f64 %fd295, %fd296;
+ mov.f64 %fd304, 0d0000000000000000;
+ mov.f64 %fd303, %fd304;
@%p39 bra $L__BB0_69;
- ld.shared.f64 %fd166, [%rd10+128];
- mul.f64 %fd295, %fd1, %fd166;
+ ld.shared.f64 %fd166, [%rd11+128];
+ mul.f64 %fd303, %fd1, %fd166;
$L__BB0_69:
- mul.f64 %fd78, %fd75, %fd295;
+ mul.f64 %fd78, %fd75, %fd303;
@%p39 bra $L__BB0_71;
- ld.shared.f64 %fd296, [%rd10+8320];
+ ld.shared.f64 %fd304, [%rd13];
$L__BB0_71:
- sub.f64 %fd81, %fd277, %fd296;
- mov.f64 %fd298, 0d0000000000000000;
- mov.f64 %fd297, %fd298;
+ sub.f64 %fd81, %fd285, %fd304;
+ mov.f64 %fd306, 0d0000000000000000;
+ mov.f64 %fd305, %fd306;
@%p39 bra $L__BB0_73;
- ld.shared.f64 %fd297, [%rd10+9344];
+ ld.shared.f64 %fd305, [%rd13+1024];
$L__BB0_73:
- mul.f64 %fd170, %fd297, %fd269;
+ mul.f64 %fd170, %fd305, %fd277;
sub.f64 %fd84, %fd81, %fd170;
@%p39 bra $L__BB0_75;
- ld.shared.f64 %fd171, [%rd10+128];
- mul.f64 %fd298, %fd1, %fd171;
+ ld.shared.f64 %fd171, [%rd11+384];
+ mul.f64 %fd306, %fd1, %fd171;
$L__BB0_75:
- mul.f64 %fd87, %fd84, %fd298;
- mov.u32 %r201, -24;
- sub.s32 %r38, %r201, %r33;
- setp.ge.s32 %p45, %r35, %r38;
- mov.f64 %fd300, 0d0000000000000000;
- mov.f64 %fd299, %fd300;
+ mul.f64 %fd87, %fd84, %fd306;
+ mov.u32 %r214, -24;
+ sub.s32 %r39, %r214, %r32;
+ setp.ge.s32 %p45, %r34, %r39;
+ add.s32 %r40, %r18, 24;
+ shr.s32 %r215, %r40, 31;
+ shr.u32 %r216, %r215, 25;
+ add.s32 %r217, %r40, %r216;
+ and.b32 %r218, %r217, -128;
+ sub.s32 %r219, %r40, %r218;
+ mul.wide.s32 %rd70, %r219, 8;
+ add.s64 %rd72, %rd21, %rd70;
+ add.s64 %rd14, %rd72, 8192;
+ mov.f64 %fd308, 0d0000000000000000;
+ mov.f64 %fd307, %fd308;
@%p45 bra $L__BB0_77;
- ld.shared.f64 %fd299, [%rd10+8384];
+ ld.shared.f64 %fd307, [%rd14];
$L__BB0_77:
- sub.f64 %fd90, %fd280, %fd299;
+ sub.f64 %fd90, %fd288, %fd307;
@%p45 bra $L__BB0_79;
- ld.shared.f64 %fd300, [%rd10+9408];
+ ld.shared.f64 %fd308, [%rd14+1024];
$L__BB0_79:
- mul.f64 %fd175, %fd300, %fd272;
+ mul.f64 %fd175, %fd308, %fd280;
sub.f64 %fd93, %fd90, %fd175;
- mov.f64 %fd302, 0d0000000000000000;
- mov.f64 %fd301, %fd302;
+ mov.f64 %fd310, 0d0000000000000000;
+ mov.f64 %fd309, %fd310;
@%p45 bra $L__BB0_81;
- ld.shared.f64 %fd176, [%rd10+192];
- mul.f64 %fd301, %fd1, %fd176;
+ ld.shared.f64 %fd176, [%rd11+192];
+ mul.f64 %fd309, %fd1, %fd176;
$L__BB0_81:
- mul.f64 %fd96, %fd93, %fd301;
+ mul.f64 %fd96, %fd93, %fd309;
@%p45 bra $L__BB0_83;
- ld.shared.f64 %fd302, [%rd10+8384];
+ ld.shared.f64 %fd310, [%rd14];
$L__BB0_83:
- sub.f64 %fd99, %fd279, %fd302;
- mov.f64 %fd304, 0d0000000000000000;
- mov.f64 %fd303, %fd304;
+ sub.f64 %fd99, %fd287, %fd310;
+ mov.f64 %fd312, 0d0000000000000000;
+ mov.f64 %fd311, %fd312;
@%p45 bra $L__BB0_85;
- ld.shared.f64 %fd303, [%rd10+9408];
+ ld.shared.f64 %fd311, [%rd14+1024];
$L__BB0_85:
- mul.f64 %fd180, %fd303, %fd271;
+ mul.f64 %fd180, %fd311, %fd279;
sub.f64 %fd102, %fd99, %fd180;
@%p45 bra $L__BB0_87;
- ld.shared.f64 %fd181, [%rd10+192];
- mul.f64 %fd304, %fd1, %fd181;
+ ld.shared.f64 %fd181, [%rd11+448];
+ mul.f64 %fd312, %fd1, %fd181;
$L__BB0_87:
- mul.f64 %fd105, %fd102, %fd304;
- shl.b32 %r39, %r4, 9;
+ mul.f64 %fd105, %fd102, %fd312;
+ shl.b32 %r41, %r4, 9;
@%p8 bra $L__BB0_90;
- neg.s32 %r202, %r39;
- setp.ge.s32 %p52, %r35, %r202;
+ neg.s32 %r220, %r41;
+ setp.ge.s32 %p52, %r34, %r220;
@%p52 bra $L__BB0_90;
- add.s32 %r207, %r39, %r18;
- add.s32 %r208, %r207, %r31;
- mad.lo.s32 %r209, %r208, %r55, %r20;
- mul.wide.s32 %rd61, %r209, 8;
- add.s64 %rd60, %rd13, %rd61;
- mov.b64 %rd62, %fd42;
- mov.b64 {%r203, %r204}, %rd62;
- mov.b64 %rd63, %fd51;
- mov.b64 {%r205, %r206}, %rd63;
-
- st.global.cs.v4.s32 [%rd60], {%r203,%r204,%r205,%r206};
+ add.s32 %r225, %r41, %r18;
+ add.s32 %r226, %r225, %r31;
+ mad.lo.s32 %r227, %r226, %r57, %r20;
+ mul.wide.s32 %rd74, %r227, 8;
+ add.s64 %rd73, %rd17, %rd74;
+ mov.b64 %rd75, %fd42;
+ mov.b64 {%r221, %r222}, %rd75;
+ mov.b64 %rd76, %fd51;
+ mov.b64 {%r223, %r224}, %rd76;
+
+ st.global.cs.v4.s32 [%rd73], {%r221,%r222,%r223,%r224};
$L__BB0_90:
- mov.u32 %r210, -8;
- sub.s32 %r211, %r210, %r39;
- setp.ge.s32 %p54, %r35, %r211;
+ mov.u32 %r228, -8;
+ sub.s32 %r229, %r228, %r41;
+ setp.ge.s32 %p54, %r34, %r229;
or.pred %p55, %p8, %p54;
@%p55 bra $L__BB0_92;
- add.s32 %r216, %r39, %r18;
- add.s32 %r217, %r216, %r31;
- add.s32 %r218, %r217, 8;
- mad.lo.s32 %r219, %r218, %r55, %r20;
- mul.wide.s32 %rd65, %r219, 8;
- add.s64 %rd64, %rd13, %rd65;
- mov.b64 %rd66, %fd60;
- mov.b64 {%r212, %r213}, %rd66;
- mov.b64 %rd67, %fd69;
- mov.b64 {%r214, %r215}, %rd67;
-
- st.global.cs.v4.s32 [%rd64], {%r212,%r213,%r214,%r215};
+ add.s32 %r234, %r36, %r41;
+ add.s32 %r235, %r234, %r31;
+ mad.lo.s32 %r236, %r235, %r57, %r20;
+ mul.wide.s32 %rd78, %r236, 8;
+ add.s64 %rd77, %rd17, %rd78;
+ mov.b64 %rd79, %fd60;
+ mov.b64 {%r230, %r231}, %rd79;
+ mov.b64 %rd80, %fd69;
+ mov.b64 {%r232, %r233}, %rd80;
+
+ st.global.cs.v4.s32 [%rd77], {%r230,%r231,%r232,%r233};
$L__BB0_92:
@%p8 bra $L__BB0_95;
- mov.u32 %r220, -16;
- sub.s32 %r221, %r220, %r39;
- setp.ge.s32 %p57, %r35, %r221;
+ mov.u32 %r237, -16;
+ sub.s32 %r238, %r237, %r41;
+ setp.ge.s32 %p57, %r34, %r238;
@%p57 bra $L__BB0_95;
- add.s32 %r226, %r39, %r18;
- add.s32 %r227, %r226, %r31;
- add.s32 %r228, %r227, 16;
- mad.lo.s32 %r229, %r228, %r55, %r20;
- mul.wide.s32 %rd69, %r229, 8;
- add.s64 %rd68, %rd13, %rd69;
- mov.b64 %rd70, %fd78;
- mov.b64 {%r222, %r223}, %rd70;
- mov.b64 %rd71, %fd87;
- mov.b64 {%r224, %r225}, %rd71;
-
- st.global.cs.v4.s32 [%rd68], {%r222,%r223,%r224,%r225};
+ add.s32 %r243, %r38, %r41;
+ add.s32 %r244, %r243, %r31;
+ mad.lo.s32 %r245, %r244, %r57, %r20;
+ mul.wide.s32 %rd82, %r245, 8;
+ add.s64 %rd81, %rd17, %rd82;
+ mov.b64 %rd83, %fd78;
+ mov.b64 {%r239, %r240}, %rd83;
+ mov.b64 %rd84, %fd87;
+ mov.b64 {%r241, %r242}, %rd84;
+
+ st.global.cs.v4.s32 [%rd81], {%r239,%r240,%r241,%r242};
$L__BB0_95:
@%p8 bra $L__BB0_99;
- mov.u32 %r230, -24;
- sub.s32 %r231, %r230, %r39;
- setp.ge.s32 %p59, %r35, %r231;
+ mov.u32 %r246, -24;
+ sub.s32 %r247, %r246, %r41;
+ setp.ge.s32 %p59, %r34, %r247;
@%p59 bra $L__BB0_99;
- add.s32 %r236, %r39, %r18;
- add.s32 %r237, %r236, %r31;
- add.s32 %r238, %r237, 24;
- mad.lo.s32 %r239, %r238, %r55, %r20;
- mul.wide.s32 %rd73, %r239, 8;
- add.s64 %rd72, %rd13, %rd73;
- mov.b64 %rd74, %fd96;
- mov.b64 {%r232, %r233}, %rd74;
- mov.b64 %rd75, %fd105;
- mov.b64 {%r234, %r235}, %rd75;
-
- st.global.cs.v4.s32 [%rd72], {%r232,%r233,%r234,%r235};
+ add.s32 %r252, %r40, %r41;
+ add.s32 %r253, %r252, %r31;
+ mad.lo.s32 %r254, %r253, %r57, %r20;
+ mul.wide.s32 %rd86, %r254, 8;
+ add.s64 %rd85, %rd17, %rd86;
+ mov.b64 %rd87, %fd96;
+ mov.b64 {%r248, %r249}, %rd87;
+ mov.b64 %rd88, %fd105;
+ mov.b64 {%r250, %r251}, %rd88;
+
+ st.global.cs.v4.s32 [%rd85], {%r248,%r249,%r250,%r251};
$L__BB0_99:
ret;
7: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_32
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
.reg .b32 %r<459>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r170, %r171}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
shr.s32 %r200, %r199, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r201, %r200, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r202, %r3, 2;
mad.lo.s32 %r203, %r202, %r201, 15;
and.b32 %r204, %r203, -16;
cvt.u64.u32 %rd1, %r204;
mul.lo.s32 %r205, %r3, %r200;
shl.b32 %r206, %r205, 4;
or.b32 %r207, %r206, 15;
and.b32 %r4, %r207, -16;
add.s32 %r208, %r207, %r4;
and.b32 %r209, %r208, -16;
cvt.s64.s32 %rd2, %r209;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p9, %r5, %r200;
shl.b32 %r6, %r5, 2;
or.b32 %r210, %r6, 3;
setp.lt.s32 %p10, %r210, %r161;
and.pred %p1, %p10, %p9;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p11, %r7, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r211, smem_ptr; }
// end inline asm
shl.b32 %r214, %r5, 4;
add.s32 %r212, %r211, %r214;
mul.wide.s32 %rd47, %r6, 4;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r213, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r213, 0;
cp.async.ca.shared.global [%r212], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r215, %r3, 215;
div.s32 %r216, %r215, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r217, %r8, %r216;
add.s32 %r218, %r217, -1;
div.s32 %r9, %r218, %r8;
setp.gt.s32 %p13, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
cvt.s64.s32 %rd48, %r4;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r220, %ctaid.y;
mul.lo.s32 %r221, %r9, %r3;
mul.lo.s32 %r10, %r221, %r220;
shl.b32 %r222, %r7, 2;
shl.b32 %r223, %r5, 4;
mad.lo.s32 %r11, %r222, %r161, %r223;
mul.lo.s32 %r224, %r161, %r7;
cvt.s64.s32 %rd52, %r224;
cvt.s64.s32 %rd53, %r6;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r225, %r10, %r161;
cvt.s64.s32 %rd6, %r225;
mul.lo.s32 %r12, %r161, %r3;
mul.lo.s32 %r13, %r9, %r220;
add.s32 %r14, %r224, %r6;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r14, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
mad.lo.s32 %r227, %r3, %r226, %r7;
mad.lo.s32 %r15, %r227, %r2, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r228, %r2;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
setp.lt.u32 %p14, %r5, %r16;
add.s32 %r232, %r16, %r5;
setp.lt.u32 %p15, %r232, %r2;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
add.s32 %r235, %r16, %r234;
shr.s32 %r17, %r235, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r6, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
mov.u32 %r417, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
// end inline asm
add.s32 %r240, %r11, %r239;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
// end inline asm
add.s32 %r266, %r11, %r265;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r237, %r417, %r3, %r7;
add.s32 %r238, %r237, %r10;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r242, %r12, %r417;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r241, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r241, 0;
cp.async.ca.shared.global [%r240], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r243, %r13, %r417;
mad.lo.s32 %r244, %r243, %r3, %r7;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r253, %r13, %r417;
mad.lo.s32 %r254, %r253, %r3, %r7;
setp.gt.s32 %p20, %r254, 215;
mov.u32 %r418, 0;
mov.u32 %r419, %r418;
mov.u32 %r420, %r418;
mov.u32 %r421, %r418;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r418, 0;
mov.u32 %r419, %r418;
mov.u32 %r420, %r418;
mov.u32 %r421, %r418;
$L__BB0_15:
add.s32 %r263, %r13, %r417;
mad.lo.s32 %r33, %r263, %r3, %r7;
mov.b32 %f112, %r421;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r420;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r419;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r418;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r264, %r33, %r170;
mul.wide.s32 %rd69, %r264, 4;
add.s64 %rd70, %rd16, %rd69;
ld.global.f32 %f352, [%rd70];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r268, %r12, %r417;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r267, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r267, 0;
cp.async.ca.shared.global [%r266], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r416, %r13, %r417;
mad.lo.s32 %r415, %r416, %r3, %r7;
setp.gt.s32 %p142, %r415, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
mul.wide.s32 %rd77, %r269, 4;
add.s64 %rd78, %rd17, %rd77;
ld.global.f32 %f353, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r422, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r5, %r422;
@%p28 bra $L__BB0_29;
add.s32 %r270, %r422, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r422, 1;
setp.gt.u32 %p29, %r422, 3;
mov.u32 %r422, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r5, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r2, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r423, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r5, %r423;
@%p34 bra $L__BB0_39;
add.s32 %r271, %r423, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r423, 1;
setp.gt.u32 %p35, %r423, 3;
mov.u32 %r423, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r2, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r272, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r273, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r274, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
mad.lo.s32 %r276, %r417, %r3, %r10;
mad.lo.s32 %r277, %r276, %r161, %r14;
mul.wide.s32 %rd86, %r277, 4;
add.s64 %rd85, %rd37, %rd86;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
// end inline asm
$L__BB0_49:
add.s32 %r417, %r417, 1;
setp.lt.s32 %p41, %r417, %r9;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r278, %tid.z;
mad.lo.s32 %r279, %r3, %r278, %r7;
mad.lo.s32 %r39, %r279, %r2, %r5;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
clz.b32 %r280, %r3;
mov.u32 %r281, 31;
sub.s32 %r282, %r281, %r280;
mov.u32 %r283, 1;
shl.b32 %r40, %r283, %r282;
setp.lt.u32 %p42, %r7, %r40;
add.s32 %r284, %r40, %r7;
setp.lt.u32 %p43, %r284, %r3;
and.pred %p5, %p42, %p43;
shl.b32 %r285, %r2, %r282;
add.s32 %r286, %r39, %r285;
mul.wide.s32 %rd89, %r286, 4;
add.s64 %rd24, %rd43, %rd89;
shr.u32 %r287, %r40, 31;
add.s32 %r288, %r40, %r287;
shr.s32 %r438, %r288, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r424, %r438;
$L__BB0_54:
setp.ge.u32 %p46, %r7, %r424;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r289, %r424, %r2, %r39;
mul.wide.s32 %rd90, %r289, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r424, 1;
setp.gt.u32 %p47, %r424, 3;
mov.u32 %r424, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r291, %r39, %r2;
mul.wide.u32 %rd93, %r291, 4;
add.s64 %rd25, %rd43, %rd93;
setp.ne.s32 %p48, %r7, 0;
mov.u32 %r425, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r3, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r425, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r426, %r438;
$L__BB0_65:
setp.ge.u32 %p52, %r7, %r426;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r292, %r426, %r2, %r39;
mul.wide.s32 %rd95, %r292, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r426, 1;
setp.gt.u32 %p53, %r426, 3;
mov.u32 %r426, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r427, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r3, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r427, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r428, %r438;
$L__BB0_76:
setp.ge.u32 %p58, %r7, %r428;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r294, %r428, %r2, %r39;
mul.wide.s32 %rd98, %r294, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r428, 1;
setp.gt.u32 %p59, %r428, 3;
mov.u32 %r428, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r429, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r3, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r429, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r430, %r438;
$L__BB0_87:
setp.ge.u32 %p64, %r7, %r430;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r296, %r430, %r2, %r39;
mul.wide.s32 %rd101, %r296, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r430, 1;
setp.gt.u32 %p65, %r430, 3;
mov.u32 %r430, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r431, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r3, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r431, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r432, %r438;
$L__BB0_98:
setp.ge.u32 %p70, %r7, %r432;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r298, %r432, %r2, %r39;
mul.wide.s32 %rd104, %r298, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r432, 1;
setp.gt.u32 %p71, %r432, 3;
mov.u32 %r432, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r433, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r3, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r433, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r434, %r438;
$L__BB0_109:
setp.ge.u32 %p76, %r7, %r434;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r300, %r434, %r2, %r39;
mul.wide.s32 %rd107, %r300, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r434, 1;
setp.gt.u32 %p77, %r434, 3;
mov.u32 %r434, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r435, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r3, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r435, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r436, %r438;
$L__BB0_120:
setp.ge.u32 %p82, %r7, %r436;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r302, %r436, %r2, %r39;
mul.wide.s32 %rd110, %r302, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r436, 1;
setp.gt.u32 %p83, %r436, 3;
mov.u32 %r436, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r437, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r437, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r7, %r438;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r304, %r438, %r2, %r39;
mul.wide.s32 %rd113, %r304, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r438, 1;
setp.gt.u32 %p89, %r438, 3;
mov.u32 %r438, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r439, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r439, %f377;
$L__BB0_137:
setp.eq.s32 %p141, %r7, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r414, %r5, 2;
mov.u32 %r314, %ctaid.y;
mad.lo.s32 %r315, %r161, %r314, %r414;
mul.wide.s32 %rd118, %r315, 4;
add.s64 %rd116, %rd40, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd116], {%r425,%r427,%r429,%r431};
// end inline asm
add.s64 %rd117, %rd41, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd117], {%r433,%r435,%r437,%r439};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r316, %r5, %r7;
or.b32 %r318, %r316, %r278;
setp.ne.s32 %p92, %r318, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
mov.u32 %r319, %ctaid.x;
mov.u32 %r320, %ctaid.z;
mov.u32 %r321, %nctaid.x;
mad.lo.s32 %r322, %r320, %r321, %r319;
mul.wide.s32 %rd120, %r322, 8;
add.s64 %rd28, %rd119, %rd120;
add.s32 %r323, %r8, -1;
setp.eq.s32 %p93, %r74, %r323;
cvt.s64.s32 %rd121, %r8;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r440, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r440;
// end inline asm
setp.lt.u32 %p95, %r440, 256;
selp.u32 %r326, 1, 0, %p95;
shl.b32 %r440, %r440, %r326;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r327, %r8, %r2;
add.s32 %r328, %r327, -1;
div.s32 %r77, %r328, %r2;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
add.s32 %r330, %r161, 1;
shr.u32 %r331, %r330, 31;
add.s32 %r332, %r330, %r331;
shr.s32 %r333, %r332, 1;
add.s32 %r334, %r3, %r333;
add.s32 %r335, %r334, -1;
shl.b32 %r336, %r7, 1;
shl.b32 %r337, %r3, 1;
mad.lo.s32 %r338, %r337, %r74, %r336;
or.b32 %r339, %r338, 1;
setp.ge.s32 %p98, %r339, %r161;
div.s32 %r340, %r335, %r3;
setp.ge.s32 %p99, %r74, %r340;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r341, %r3, %r74;
shl.b32 %r342, %r341, 1;
mad.lo.s32 %r343, %r161, %r5, %r342;
add.s32 %r442, %r343, %r336;
mul.lo.s32 %r79, %r161, %r2;
mov.u32 %r329, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r441, %r5;
mov.u32 %r443, %r329;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r444, %r329;
mov.u32 %r445, %r329;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r441, %r8;
mov.u32 %r444, %r329;
mov.u32 %r445, %r329;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd130, %r442, 4;
add.s64 %rd129, %rd41, %rd130;
// begin inline asm
ld.volatile.global.v2.s32 {%r445,%r444}, [%rd129];
// end inline asm
$L__BB0_148:
mov.b32 %f304, %r445;
add.f32 %f380, %f380, %f304;
mov.b32 %f305, %r444;
add.f32 %f381, %f381, %f305;
add.s32 %r442, %r442, %r79;
add.s32 %r441, %r441, %r2;
add.s32 %r443, %r443, 1;
setp.lt.s32 %p101, %r443, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r350, %r2;
mov.u32 %r351, 31;
sub.s32 %r352, %r351, %r350;
mov.u32 %r353, 1;
shl.b32 %r90, %r353, %r352;
setp.lt.u32 %p102, %r5, %r90;
add.s32 %r354, %r90, %r5;
setp.lt.u32 %p103, %r354, %r2;
and.pred %p7, %p102, %p103;
add.s32 %r355, %r39, %r90;
mul.wide.s32 %rd131, %r355, 4;
add.s64 %rd30, %rd43, %rd131;
shr.u32 %r356, %r90, 31;
add.s32 %r357, %r90, %r356;
shr.s32 %r457, %r357, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r446, %r457;
$L__BB0_153:
setp.ge.u32 %p106, %r5, %r446;
@%p106 bra $L__BB0_155;
add.s32 %r358, %r446, %r39;
mul.wide.s32 %rd133, %r358, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r446, 1;
setp.gt.u32 %p107, %r446, 3;
mov.u32 %r446, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r360, %r39, 1;
mul.wide.u32 %rd136, %r360, 4;
add.s64 %rd31, %rd43, %rd136;
setp.ne.s32 %p108, %r5, 0;
mov.u32 %r447, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r2, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
mov.b32 %r447, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r448, %r457;
$L__BB0_164:
setp.ge.u32 %p112, %r5, %r448;
@%p112 bra $L__BB0_166;
add.s32 %r361, %r448, %r39;
mul.wide.s32 %rd138, %r361, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r448, 1;
setp.gt.u32 %p113, %r448, 3;
mov.u32 %r448, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r449, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r2, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
mov.b32 %r449, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r363, %r161, 1;
shr.u32 %r364, %r363, 31;
add.s32 %r365, %r363, %r364;
shr.s32 %r366, %r365, 1;
add.s32 %r367, %r3, %r366;
add.s32 %r368, %r367, -1;
div.s32 %r369, %r368, %r3;
setp.ge.s32 %p117, %r74, %r369;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r7, 1;
mul.lo.s32 %r370, %r3, %r74;
shl.b32 %r101, %r370, 1;
add.s32 %r371, %r100, %r101;
or.b32 %r372, %r371, 1;
setp.ge.s32 %p118, %r372, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r375, %r101, %r100;
mul.wide.s32 %rd142, %r375, 4;
add.s64 %rd141, %rd154, %rd142;
// begin inline asm
st.global.cs.v2.s32 [%rd141], {%r447,%r449};
// end inline asm
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
add.s32 %r377, %r161, 1;
shr.u32 %r378, %r377, 31;
add.s32 %r379, %r377, %r378;
shr.s32 %r380, %r379, 1;
add.s32 %r381, %r3, %r380;
add.s32 %r382, %r381, -1;
shl.b32 %r383, %r7, 1;
shl.b32 %r384, %r3, 1;
mad.lo.s32 %r385, %r384, %r74, %r383;
or.b32 %r386, %r385, 1;
setp.ge.s32 %p120, %r386, %r161;
div.s32 %r387, %r382, %r3;
setp.ge.s32 %p121, %r74, %r387;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r388, %r3, %r74;
shl.b32 %r389, %r388, 1;
mad.lo.s32 %r390, %r161, %r5, %r389;
add.s32 %r451, %r390, %r383;
mul.lo.s32 %r103, %r161, %r2;
mov.u32 %r376, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r450, %r5;
mov.u32 %r452, %r376;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r453, %r376;
mov.u32 %r454, %r376;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r450, %r8;
mov.u32 %r453, %r376;
mov.u32 %r454, %r376;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd144, %r451, 4;
add.s64 %rd143, %rd40, %rd144;
// begin inline asm
ld.volatile.global.v2.s32 {%r454,%r453}, [%rd143];
// end inline asm
$L__BB0_180:
mov.b32 %f326, %r454;
add.f32 %f386, %f386, %f326;
mov.b32 %f327, %r453;
add.f32 %f387, %f387, %f327;
add.s32 %r451, %r451, %r103;
add.s32 %r450, %r450, %r2;
add.s32 %r452, %r452, 1;
setp.lt.s32 %p123, %r452, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r455, %r457;
$L__BB0_185:
setp.ge.u32 %p126, %r5, %r455;
@%p126 bra $L__BB0_187;
add.s32 %r397, %r455, %r39;
mul.wide.s32 %rd145, %r397, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r455, 1;
setp.gt.u32 %p127, %r455, 3;
mov.u32 %r455, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r456, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r2, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
mov.b32 %r456, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r5, %r457;
@%p132 bra $L__BB0_197;
add.s32 %r399, %r457, %r39;
mul.wide.s32 %rd148, %r399, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r457, 1;
setp.gt.u32 %p133, %r457, 3;
mov.u32 %r457, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r458, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r2, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
mov.b32 %r458, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r401, %r161, 1;
shr.u32 %r402, %r401, 31;
add.s32 %r403, %r401, %r402;
shr.s32 %r404, %r403, 1;
add.s32 %r405, %r3, %r404;
add.s32 %r406, %r405, -1;
div.s32 %r407, %r406, %r3;
setp.ge.s32 %p137, %r74, %r407;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r7, 1;
mul.lo.s32 %r408, %r3, %r74;
shl.b32 %r123, %r408, 1;
add.s32 %r409, %r122, %r123;
or.b32 %r410, %r409, 1;
setp.ge.s32 %p138, %r410, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_f4023a39_1033910nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r413, %r123, %r122;
mul.wide.s32 %rd152, %r413, 4;
add.s64 %rd151, %rd153, %rd152;
// begin inline asm
st.global.cs.v2.s32 [%rd151], {%r456,%r458};
// end inline asm
$L__BB0_206:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
.reg .b32 %r<457>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r170, %r171}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
shr.s32 %r2, %r199, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r200, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r201, %r4, 2;
mad.lo.s32 %r202, %r201, %r200, 15;
and.b32 %r203, %r202, -16;
cvt.u64.u32 %rd1, %r203;
mul.lo.s32 %r204, %r4, %r2;
shl.b32 %r205, %r204, 4;
or.b32 %r206, %r205, 15;
and.b32 %r5, %r206, -16;
add.s32 %r207, %r206, %r5;
and.b32 %r208, %r207, -16;
cvt.s64.s32 %rd2, %r208;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r209, %r7, 3;
setp.lt.s32 %p10, %r209, %r161;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r210, smem_ptr; }
// end inline asm
shl.b32 %r213, %r6, 4;
add.s32 %r211, %r210, %r213;
mul.wide.s32 %rd47, %r7, 4;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r212, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r212, 0;
cp.async.ca.shared.global [%r211], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r214, %r4, 215;
div.s32 %r215, %r214, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r216, %r9, %r215;
add.s32 %r217, %r216, -1;
div.s32 %r10, %r217, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r219, %ctaid.y;
mul.lo.s32 %r220, %r10, %r4;
mul.lo.s32 %r11, %r220, %r219;
mad.lo.s32 %r221, %r2, %r8, %r6;
shl.b32 %r12, %r221, 4;
mul.lo.s32 %r222, %r161, %r8;
cvt.s64.s32 %rd52, %r222;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r223, %r11, %r161;
cvt.s64.s32 %rd6, %r223;
mul.lo.s32 %r13, %r161, %r4;
mul.lo.s32 %r14, %r10, %r219;
shl.b32 %r224, %r8, 2;
mad.lo.s32 %r225, %r224, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r225, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
mad.lo.s32 %r227, %r4, %r226, %r8;
mad.lo.s32 %r15, %r227, %r3, %r6;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r228, %r3;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r232, %r16, %r6;
setp.lt.u32 %p15, %r232, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
add.s32 %r235, %r16, %r234;
shr.s32 %r17, %r235, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
mov.u32 %r415, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
// end inline asm
add.s32 %r240, %r239, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
// end inline asm
add.s32 %r266, %r265, %r12;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r237, %r415, %r4, %r8;
add.s32 %r238, %r237, %r11;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r242, %r13, %r415;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r241, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r241, 0;
cp.async.ca.shared.global [%r240], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r243, %r14, %r415;
mad.lo.s32 %r244, %r243, %r4, %r8;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r253, %r14, %r415;
mad.lo.s32 %r254, %r253, %r4, %r8;
setp.gt.s32 %p20, %r254, 215;
mov.u32 %r416, 0;
mov.u32 %r417, %r416;
mov.u32 %r418, %r416;
mov.u32 %r419, %r416;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r416, 0;
mov.u32 %r417, %r416;
mov.u32 %r418, %r416;
mov.u32 %r419, %r416;
$L__BB0_15:
add.s32 %r263, %r14, %r415;
mad.lo.s32 %r33, %r263, %r4, %r8;
mov.b32 %f112, %r419;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r418;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r417;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r416;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r264, %r33, %r170;
mul.wide.s32 %rd69, %r264, 4;
add.s64 %rd70, %rd16, %rd69;
ld.global.f32 %f352, [%rd70];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r268, %r13, %r415;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r267, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r267, 0;
cp.async.ca.shared.global [%r266], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r414, %r14, %r415;
mad.lo.s32 %r413, %r414, %r4, %r8;
setp.gt.s32 %p142, %r413, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
mul.wide.s32 %rd77, %r269, 4;
add.s64 %rd78, %rd17, %rd77;
ld.global.f32 %f353, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r420, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r6, %r420;
@%p28 bra $L__BB0_29;
add.s32 %r270, %r420, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r420, 1;
setp.gt.u32 %p29, %r420, 3;
mov.u32 %r420, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r421, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r6, %r421;
@%p34 bra $L__BB0_39;
add.s32 %r271, %r421, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r421, 1;
setp.gt.u32 %p35, %r421, 3;
mov.u32 %r421, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r272, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r273, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r274, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
mad.lo.s32 %r276, %r33, %r161, %r7;
mul.wide.s32 %rd86, %r276, 4;
add.s64 %rd85, %rd37, %rd86;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
// end inline asm
$L__BB0_49:
add.s32 %r415, %r415, 1;
setp.lt.s32 %p41, %r415, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r277, %tid.z;
mad.lo.s32 %r278, %r4, %r277, %r8;
mad.lo.s32 %r39, %r278, %r3, %r6;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
clz.b32 %r279, %r4;
mov.u32 %r280, 31;
sub.s32 %r281, %r280, %r279;
mov.u32 %r282, 1;
shl.b32 %r40, %r282, %r281;
setp.lt.u32 %p42, %r8, %r40;
add.s32 %r283, %r40, %r8;
setp.lt.u32 %p43, %r283, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r284, %r3, %r281;
add.s32 %r285, %r39, %r284;
mul.wide.s32 %rd89, %r285, 4;
add.s64 %rd24, %rd43, %rd89;
shr.u32 %r286, %r40, 31;
add.s32 %r287, %r40, %r286;
shr.s32 %r436, %r287, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r422, %r436;
$L__BB0_54:
setp.ge.u32 %p46, %r8, %r422;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r288, %r422, %r3, %r39;
mul.wide.s32 %rd90, %r288, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r422, 1;
setp.gt.u32 %p47, %r422, 3;
mov.u32 %r422, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r290, %r39, %r3;
mul.wide.u32 %rd93, %r290, 4;
add.s64 %rd25, %rd43, %rd93;
setp.ne.s32 %p48, %r8, 0;
mov.u32 %r423, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r423, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r424, %r436;
$L__BB0_65:
setp.ge.u32 %p52, %r8, %r424;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r291, %r424, %r3, %r39;
mul.wide.s32 %rd95, %r291, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r424, 1;
setp.gt.u32 %p53, %r424, 3;
mov.u32 %r424, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r425, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r425, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r426, %r436;
$L__BB0_76:
setp.ge.u32 %p58, %r8, %r426;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r293, %r426, %r3, %r39;
mul.wide.s32 %rd98, %r293, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r426, 1;
setp.gt.u32 %p59, %r426, 3;
mov.u32 %r426, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r427, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r427, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r428, %r436;
$L__BB0_87:
setp.ge.u32 %p64, %r8, %r428;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r295, %r428, %r3, %r39;
mul.wide.s32 %rd101, %r295, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r428, 1;
setp.gt.u32 %p65, %r428, 3;
mov.u32 %r428, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r429, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r429, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r430, %r436;
$L__BB0_98:
setp.ge.u32 %p70, %r8, %r430;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r297, %r430, %r3, %r39;
mul.wide.s32 %rd104, %r297, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r430, 1;
setp.gt.u32 %p71, %r430, 3;
mov.u32 %r430, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r431, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r431, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r432, %r436;
$L__BB0_109:
setp.ge.u32 %p76, %r8, %r432;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r299, %r432, %r3, %r39;
mul.wide.s32 %rd107, %r299, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r432, 1;
setp.gt.u32 %p77, %r432, 3;
mov.u32 %r432, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r433, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r433, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r434, %r436;
$L__BB0_120:
setp.ge.u32 %p82, %r8, %r434;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r301, %r434, %r3, %r39;
mul.wide.s32 %rd110, %r301, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r434, 1;
setp.gt.u32 %p83, %r434, 3;
mov.u32 %r434, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r435, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r435, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r8, %r436;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r303, %r436, %r3, %r39;
mul.wide.s32 %rd113, %r303, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r436, 1;
setp.gt.u32 %p89, %r436, 3;
mov.u32 %r436, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r437, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r437, %f377;
$L__BB0_137:
setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r313, %ctaid.y;
mad.lo.s32 %r314, %r161, %r313, %r7;
mul.wide.s32 %rd118, %r314, 4;
add.s64 %rd116, %rd40, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd116], {%r423,%r425,%r427,%r429};
// end inline asm
add.s64 %rd117, %rd41, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd117], {%r431,%r433,%r435,%r437};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r315, %r6, %r8;
or.b32 %r317, %r315, %r277;
setp.ne.s32 %p92, %r317, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
mov.u32 %r318, %ctaid.x;
mov.u32 %r319, %ctaid.z;
mov.u32 %r320, %nctaid.x;
mad.lo.s32 %r321, %r319, %r320, %r318;
mul.wide.s32 %rd120, %r321, 8;
add.s64 %rd28, %rd119, %rd120;
add.s32 %r322, %r9, -1;
setp.eq.s32 %p93, %r74, %r322;
cvt.s64.s32 %rd121, %r9;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r438, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r438;
// end inline asm
setp.lt.u32 %p95, %r438, 256;
selp.u32 %r325, 1, 0, %p95;
shl.b32 %r438, %r438, %r325;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r326, %r9, %r3;
add.s32 %r327, %r326, -1;
div.s32 %r77, %r327, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
add.s32 %r329, %r161, 1;
shr.u32 %r330, %r329, 31;
add.s32 %r331, %r329, %r330;
shr.s32 %r332, %r331, 1;
add.s32 %r333, %r4, %r332;
add.s32 %r334, %r333, -1;
shl.b32 %r335, %r8, 1;
shl.b32 %r336, %r4, 1;
mad.lo.s32 %r337, %r336, %r74, %r335;
or.b32 %r338, %r337, 1;
setp.ge.s32 %p98, %r338, %r161;
div.s32 %r339, %r334, %r4;
setp.ge.s32 %p99, %r74, %r339;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r340, %r4, %r74;
shl.b32 %r341, %r340, 1;
mad.lo.s32 %r342, %r161, %r6, %r341;
add.s32 %r440, %r342, %r335;
mul.lo.s32 %r79, %r161, %r3;
mov.u32 %r328, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r439, %r6;
mov.u32 %r441, %r328;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r442, %r328;
mov.u32 %r443, %r328;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r439, %r9;
mov.u32 %r442, %r328;
mov.u32 %r443, %r328;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd130, %r440, 4;
add.s64 %rd129, %rd41, %rd130;
// begin inline asm
ld.volatile.global.v2.s32 {%r443,%r442}, [%rd129];
// end inline asm
$L__BB0_148:
mov.b32 %f304, %r443;
add.f32 %f380, %f380, %f304;
mov.b32 %f305, %r442;
add.f32 %f381, %f381, %f305;
add.s32 %r440, %r440, %r79;
add.s32 %r439, %r439, %r3;
add.s32 %r441, %r441, 1;
setp.lt.s32 %p101, %r441, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r349, %r3;
mov.u32 %r350, 31;
sub.s32 %r351, %r350, %r349;
mov.u32 %r352, 1;
shl.b32 %r90, %r352, %r351;
setp.lt.u32 %p102, %r6, %r90;
add.s32 %r353, %r90, %r6;
setp.lt.u32 %p103, %r353, %r3;
and.pred %p7, %p102, %p103;
add.s32 %r354, %r39, %r90;
mul.wide.s32 %rd131, %r354, 4;
add.s64 %rd30, %rd43, %rd131;
shr.u32 %r355, %r90, 31;
add.s32 %r356, %r90, %r355;
shr.s32 %r455, %r356, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r444, %r455;
$L__BB0_153:
setp.ge.u32 %p106, %r6, %r444;
@%p106 bra $L__BB0_155;
add.s32 %r357, %r444, %r39;
mul.wide.s32 %rd133, %r357, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r444, 1;
setp.gt.u32 %p107, %r444, 3;
mov.u32 %r444, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r359, %r39, 1;
mul.wide.u32 %rd136, %r359, 4;
add.s64 %rd31, %rd43, %rd136;
setp.ne.s32 %p108, %r6, 0;
mov.u32 %r445, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
mov.b32 %r445, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r446, %r455;
$L__BB0_164:
setp.ge.u32 %p112, %r6, %r446;
@%p112 bra $L__BB0_166;
add.s32 %r360, %r446, %r39;
mul.wide.s32 %rd138, %r360, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r446, 1;
setp.gt.u32 %p113, %r446, 3;
mov.u32 %r446, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r447, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
mov.b32 %r447, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r362, %r161, 1;
shr.u32 %r363, %r362, 31;
add.s32 %r364, %r362, %r363;
shr.s32 %r365, %r364, 1;
add.s32 %r366, %r4, %r365;
add.s32 %r367, %r366, -1;
div.s32 %r368, %r367, %r4;
setp.ge.s32 %p117, %r74, %r368;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r8, 1;
mul.lo.s32 %r369, %r4, %r74;
shl.b32 %r101, %r369, 1;
add.s32 %r370, %r100, %r101;
or.b32 %r371, %r370, 1;
setp.ge.s32 %p118, %r371, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r374, %r101, %r100;
mul.wide.s32 %rd142, %r374, 4;
add.s64 %rd141, %rd154, %rd142;
// begin inline asm
st.global.cs.v2.s32 [%rd141], {%r445,%r447};
// end inline asm
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
add.s32 %r376, %r161, 1;
shr.u32 %r377, %r376, 31;
add.s32 %r378, %r376, %r377;
shr.s32 %r379, %r378, 1;
add.s32 %r380, %r4, %r379;
add.s32 %r381, %r380, -1;
shl.b32 %r382, %r8, 1;
shl.b32 %r383, %r4, 1;
mad.lo.s32 %r384, %r383, %r74, %r382;
or.b32 %r385, %r384, 1;
setp.ge.s32 %p120, %r385, %r161;
div.s32 %r386, %r381, %r4;
setp.ge.s32 %p121, %r74, %r386;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r387, %r4, %r74;
shl.b32 %r388, %r387, 1;
mad.lo.s32 %r389, %r161, %r6, %r388;
add.s32 %r449, %r389, %r382;
mul.lo.s32 %r103, %r161, %r3;
mov.u32 %r375, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r448, %r6;
mov.u32 %r450, %r375;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r451, %r375;
mov.u32 %r452, %r375;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r448, %r9;
mov.u32 %r451, %r375;
mov.u32 %r452, %r375;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd144, %r449, 4;
add.s64 %rd143, %rd40, %rd144;
// begin inline asm
ld.volatile.global.v2.s32 {%r452,%r451}, [%rd143];
// end inline asm
$L__BB0_180:
mov.b32 %f326, %r452;
add.f32 %f386, %f386, %f326;
mov.b32 %f327, %r451;
add.f32 %f387, %f387, %f327;
add.s32 %r449, %r449, %r103;
add.s32 %r448, %r448, %r3;
add.s32 %r450, %r450, 1;
setp.lt.s32 %p123, %r450, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r453, %r455;
$L__BB0_185:
setp.ge.u32 %p126, %r6, %r453;
@%p126 bra $L__BB0_187;
add.s32 %r396, %r453, %r39;
mul.wide.s32 %rd145, %r396, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r453, 1;
setp.gt.u32 %p127, %r453, 3;
mov.u32 %r453, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r454, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
mov.b32 %r454, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r6, %r455;
@%p132 bra $L__BB0_197;
add.s32 %r398, %r455, %r39;
mul.wide.s32 %rd148, %r398, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r455, 1;
setp.gt.u32 %p133, %r455, 3;
mov.u32 %r455, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r456, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
mov.b32 %r456, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r400, %r161, 1;
shr.u32 %r401, %r400, 31;
add.s32 %r402, %r400, %r401;
shr.s32 %r403, %r402, 1;
add.s32 %r404, %r4, %r403;
add.s32 %r405, %r404, -1;
div.s32 %r406, %r405, %r4;
setp.ge.s32 %p137, %r74, %r406;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r8, 1;
mul.lo.s32 %r407, %r4, %r74;
shl.b32 %r123, %r407, 1;
add.s32 %r408, %r122, %r123;
or.b32 %r409, %r408, 1;
setp.ge.s32 %p138, %r409, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_15_cu_113f13a3_723310nvfuser_15ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r412, %r123, %r122;
mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd151, %rd153, %rd152;
// begin inline asm
st.global.cs.v2.s32 [%rd151], {%r454,%r456};
// end inline asm
$L__BB0_206:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
- .reg .b32 %r<459>;
+ .reg .b32 %r<457>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
@@ -50,110 +50,110 @@
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
- shr.s32 %r200, %r199, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r201, %r200, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r202, %r3, 2;
- mad.lo.s32 %r203, %r202, %r201, 15;
- and.b32 %r204, %r203, -16;
- cvt.u64.u32 %rd1, %r204;
- mul.lo.s32 %r205, %r3, %r200;
- shl.b32 %r206, %r205, 4;
- or.b32 %r207, %r206, 15;
- and.b32 %r4, %r207, -16;
- add.s32 %r208, %r207, %r4;
- and.b32 %r209, %r208, -16;
- cvt.s64.s32 %rd2, %r209;
+ shr.s32 %r2, %r199, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r200, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r201, %r4, 2;
+ mad.lo.s32 %r202, %r201, %r200, 15;
+ and.b32 %r203, %r202, -16;
+ cvt.u64.u32 %rd1, %r203;
+ mul.lo.s32 %r204, %r4, %r2;
+ shl.b32 %r205, %r204, 4;
+ or.b32 %r206, %r205, 15;
+ and.b32 %r5, %r206, -16;
+ add.s32 %r207, %r206, %r5;
+ and.b32 %r208, %r207, -16;
+ cvt.s64.s32 %rd2, %r208;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p9, %r5, %r200;
- shl.b32 %r6, %r5, 2;
- or.b32 %r210, %r6, 3;
- setp.lt.s32 %p10, %r210, %r161;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p9, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r209, %r7, 3;
+ setp.lt.s32 %p10, %r209, %r161;
and.pred %p1, %p10, %p9;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p11, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r211, smem_ptr; }
-
-
- shl.b32 %r214, %r5, 4;
- add.s32 %r212, %r211, %r214;
- mul.wide.s32 %rd47, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r210, smem_ptr; }
+
+
+ shl.b32 %r213, %r6, 4;
+ add.s32 %r211, %r210, %r213;
+ mul.wide.s32 %rd47, %r7, 4;
add.s64 %rd46, %rd36, %rd47;
- mov.u32 %r213, 0;
+ mov.u32 %r212, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r213, 0;
- cp.async.ca.shared.global [%r212], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r212, 0;
+ cp.async.ca.shared.global [%r211], [%rd46], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r215, %r3, 215;
- div.s32 %r216, %r215, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r217, %r8, %r216;
- add.s32 %r218, %r217, -1;
- div.s32 %r9, %r218, %r8;
- setp.gt.s32 %p13, %r9, 0;
+ add.s32 %r214, %r4, 215;
+ div.s32 %r215, %r214, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r216, %r9, %r215;
+ add.s32 %r217, %r216, -1;
+ div.s32 %r10, %r217, %r9;
+ setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
- cvt.s64.s32 %rd48, %r4;
+ cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
- mov.u32 %r220, %ctaid.y;
- mul.lo.s32 %r221, %r9, %r3;
- mul.lo.s32 %r10, %r221, %r220;
- shl.b32 %r222, %r7, 2;
- shl.b32 %r223, %r5, 4;
- mad.lo.s32 %r11, %r222, %r161, %r223;
- mul.lo.s32 %r224, %r161, %r7;
- cvt.s64.s32 %rd52, %r224;
- cvt.s64.s32 %rd53, %r6;
+ mov.u32 %r219, %ctaid.y;
+ mul.lo.s32 %r220, %r10, %r4;
+ mul.lo.s32 %r11, %r220, %r219;
+ mad.lo.s32 %r221, %r2, %r8, %r6;
+ shl.b32 %r12, %r221, 4;
+ mul.lo.s32 %r222, %r161, %r8;
+ cvt.s64.s32 %rd52, %r222;
+ cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r225, %r10, %r161;
- cvt.s64.s32 %rd6, %r225;
- mul.lo.s32 %r12, %r161, %r3;
- mul.lo.s32 %r13, %r9, %r220;
- add.s32 %r14, %r224, %r6;
+ mul.lo.s32 %r223, %r11, %r161;
+ cvt.s64.s32 %rd6, %r223;
+ mul.lo.s32 %r13, %r161, %r4;
+ mul.lo.s32 %r14, %r10, %r219;
+ shl.b32 %r224, %r8, 2;
+ mad.lo.s32 %r225, %r224, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
- mul.wide.s32 %rd55, %r14, 4;
+ mul.wide.s32 %rd55, %r225, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
- mad.lo.s32 %r227, %r3, %r226, %r7;
- mad.lo.s32 %r15, %r227, %r2, %r5;
+ mad.lo.s32 %r227, %r4, %r226, %r8;
+ mad.lo.s32 %r15, %r227, %r3, %r6;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
- clz.b32 %r228, %r2;
+ clz.b32 %r228, %r3;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
- setp.lt.u32 %p14, %r5, %r16;
- add.s32 %r232, %r16, %r5;
- setp.lt.u32 %p15, %r232, %r2;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r232, %r16, %r6;
+ setp.lt.u32 %p15, %r232, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
@@ -162,31 +162,31 @@
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
- mul.wide.s32 %rd60, %r6, 4;
+ mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
- mov.u32 %r417, 0;
+ mov.u32 %r415, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
- add.s32 %r240, %r11, %r239;
+ add.s32 %r240, %r239, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
- add.s32 %r266, %r11, %r265;
+ add.s32 %r266, %r265, %r12;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
@@ -196,16 +196,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r237, %r417, %r3, %r7;
- add.s32 %r238, %r237, %r10;
+ mad.lo.s32 %r237, %r415, %r4, %r8;
+ add.s32 %r238, %r237, %r11;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r242, %r12, %r417;
+ mul.lo.s32 %r242, %r13, %r415;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
@@ -224,53 +224,53 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r243, %r13, %r417;
- mad.lo.s32 %r244, %r243, %r3, %r7;
+ add.s32 %r243, %r14, %r415;
+ mad.lo.s32 %r244, %r243, %r4, %r8;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
+ ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r253, %r13, %r417;
- mad.lo.s32 %r254, %r253, %r3, %r7;
+ add.s32 %r253, %r14, %r415;
+ mad.lo.s32 %r254, %r253, %r4, %r8;
setp.gt.s32 %p20, %r254, 215;
- mov.u32 %r418, 0;
- mov.u32 %r419, %r418;
- mov.u32 %r420, %r418;
- mov.u32 %r421, %r418;
+ mov.u32 %r416, 0;
+ mov.u32 %r417, %r416;
+ mov.u32 %r418, %r416;
+ mov.u32 %r419, %r416;
@%p20 bra $L__BB0_15;
- ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
+ ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r418, 0;
- mov.u32 %r419, %r418;
- mov.u32 %r420, %r418;
- mov.u32 %r421, %r418;
+ mov.u32 %r416, 0;
+ mov.u32 %r417, %r416;
+ mov.u32 %r418, %r416;
+ mov.u32 %r419, %r416;
$L__BB0_15:
- add.s32 %r263, %r13, %r417;
- mad.lo.s32 %r33, %r263, %r3, %r7;
- mov.b32 %f112, %r421;
+ add.s32 %r263, %r14, %r415;
+ mad.lo.s32 %r33, %r263, %r4, %r8;
+ mov.b32 %f112, %r419;
add.f32 %f369, %f369, %f112;
- mov.b32 %f113, %r420;
+ mov.b32 %f113, %r418;
add.f32 %f368, %f368, %f113;
- mov.b32 %f114, %r419;
+ mov.b32 %f114, %r417;
add.f32 %f367, %f367, %f114;
- mov.b32 %f115, %r418;
+ mov.b32 %f115, %r416;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
@@ -283,11 +283,11 @@
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
- mul.lo.s32 %r268, %r12, %r417;
+ mul.lo.s32 %r268, %r13, %r415;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
@@ -300,13 +300,13 @@
}
$L__BB0_19:
- add.s32 %r416, %r13, %r417;
- mad.lo.s32 %r415, %r416, %r3, %r7;
- setp.gt.s32 %p142, %r415, 215;
+ add.s32 %r414, %r14, %r415;
+ mad.lo.s32 %r413, %r414, %r4, %r8;
+ setp.gt.s32 %p142, %r413, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
@@ -363,37 +363,37 @@
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
- mov.u32 %r422, %r17;
+ mov.u32 %r420, %r17;
$L__BB0_27:
- setp.ge.u32 %p28, %r5, %r422;
+ setp.ge.u32 %p28, %r6, %r420;
@%p28 bra $L__BB0_29;
- add.s32 %r270, %r422, %r15;
+ add.s32 %r270, %r420, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r422, 1;
- setp.gt.u32 %p29, %r422, 3;
- mov.u32 %r422, %r35;
+ shr.u32 %r35, %r420, 1;
+ setp.gt.u32 %p29, %r420, 3;
+ mov.u32 %r420, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p30, %r5, 0;
+ setp.ne.s32 %p30, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
- setp.lt.u32 %p31, %r2, 2;
+ setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
@@ -413,36 +413,36 @@
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
- mov.u32 %r423, %r17;
+ mov.u32 %r421, %r17;
$L__BB0_37:
- setp.ge.u32 %p34, %r5, %r423;
+ setp.ge.u32 %p34, %r6, %r421;
@%p34 bra $L__BB0_39;
- add.s32 %r271, %r423, %r15;
+ add.s32 %r271, %r421, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r423, 1;
- setp.gt.u32 %p35, %r423, 3;
- mov.u32 %r423, %r37;
+ shr.u32 %r37, %r421, 1;
+ setp.gt.u32 %p35, %r421, 3;
+ mov.u32 %r421, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
- setp.lt.u32 %p37, %r2, 2;
+ setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
@@ -506,21 +506,20 @@
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
- mad.lo.s32 %r276, %r417, %r3, %r10;
- mad.lo.s32 %r277, %r276, %r161, %r14;
- mul.wide.s32 %rd86, %r277, 4;
+ mad.lo.s32 %r276, %r33, %r161, %r7;
+ mul.wide.s32 %rd86, %r276, 4;
add.s64 %rd85, %rd37, %rd86;
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
$L__BB0_49:
- add.s32 %r417, %r417, 1;
- setp.lt.s32 %p41, %r417, %r9;
+ add.s32 %r415, %r415, 1;
+ setp.lt.s32 %p41, %r415, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
@@ -531,31 +530,31 @@
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
- mov.u32 %r278, %tid.z;
- mad.lo.s32 %r279, %r3, %r278, %r7;
- mad.lo.s32 %r39, %r279, %r2, %r5;
+ mov.u32 %r277, %tid.z;
+ mad.lo.s32 %r278, %r4, %r277, %r8;
+ mad.lo.s32 %r39, %r278, %r3, %r6;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
- clz.b32 %r280, %r3;
- mov.u32 %r281, 31;
- sub.s32 %r282, %r281, %r280;
- mov.u32 %r283, 1;
- shl.b32 %r40, %r283, %r282;
- setp.lt.u32 %p42, %r7, %r40;
- add.s32 %r284, %r40, %r7;
- setp.lt.u32 %p43, %r284, %r3;
+ clz.b32 %r279, %r4;
+ mov.u32 %r280, 31;
+ sub.s32 %r281, %r280, %r279;
+ mov.u32 %r282, 1;
+ shl.b32 %r40, %r282, %r281;
+ setp.lt.u32 %p42, %r8, %r40;
+ add.s32 %r283, %r40, %r8;
+ setp.lt.u32 %p43, %r283, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r285, %r2, %r282;
- add.s32 %r286, %r39, %r285;
- mul.wide.s32 %rd89, %r286, 4;
+ shl.b32 %r284, %r3, %r281;
+ add.s32 %r285, %r39, %r284;
+ mul.wide.s32 %rd89, %r285, 4;
add.s64 %rd24, %rd43, %rd89;
- shr.u32 %r287, %r40, 31;
- add.s32 %r288, %r40, %r287;
- shr.s32 %r438, %r288, 1;
+ shr.u32 %r286, %r40, 31;
+ add.s32 %r287, %r40, %r286;
+ shr.s32 %r436, %r287, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
@@ -567,49 +566,49 @@
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
- mov.u32 %r424, %r438;
+ mov.u32 %r422, %r436;
$L__BB0_54:
- setp.ge.u32 %p46, %r7, %r424;
+ setp.ge.u32 %p46, %r8, %r422;
@%p46 bra $L__BB0_56;
- mad.lo.s32 %r289, %r424, %r2, %r39;
- mul.wide.s32 %rd90, %r289, 4;
+ mad.lo.s32 %r288, %r422, %r3, %r39;
+ mul.wide.s32 %rd90, %r288, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r424, 1;
- setp.gt.u32 %p47, %r424, 3;
- mov.u32 %r424, %r43;
+ shr.u32 %r43, %r422, 1;
+ setp.gt.u32 %p47, %r422, 3;
+ mov.u32 %r422, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r291, %r39, %r2;
- mul.wide.u32 %rd93, %r291, 4;
+ add.s32 %r290, %r39, %r3;
+ mul.wide.u32 %rd93, %r290, 4;
add.s64 %rd25, %rd43, %rd93;
- setp.ne.s32 %p48, %r7, 0;
- mov.u32 %r425, 0;
+ setp.ne.s32 %p48, %r8, 0;
+ mov.u32 %r423, 0;
@%p48 bra $L__BB0_61;
- setp.lt.u32 %p49, %r3, 2;
+ setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
- mov.b32 %r425, %f370;
+ mov.b32 %r423, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@@ -622,45 +621,45 @@
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
- mov.u32 %r426, %r438;
+ mov.u32 %r424, %r436;
$L__BB0_65:
- setp.ge.u32 %p52, %r7, %r426;
+ setp.ge.u32 %p52, %r8, %r424;
@%p52 bra $L__BB0_67;
- mad.lo.s32 %r292, %r426, %r2, %r39;
- mul.wide.s32 %rd95, %r292, 4;
+ mad.lo.s32 %r291, %r424, %r3, %r39;
+ mul.wide.s32 %rd95, %r291, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r426, 1;
- setp.gt.u32 %p53, %r426, 3;
- mov.u32 %r426, %r47;
+ shr.u32 %r47, %r424, 1;
+ setp.gt.u32 %p53, %r424, 3;
+ mov.u32 %r424, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r427, 0;
+ mov.u32 %r425, 0;
@%p48 bra $L__BB0_72;
- setp.lt.u32 %p55, %r3, 2;
+ setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
- mov.b32 %r427, %f371;
+ mov.b32 %r425, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@@ -673,45 +672,45 @@
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
- mov.u32 %r428, %r438;
+ mov.u32 %r426, %r436;
$L__BB0_76:
- setp.ge.u32 %p58, %r7, %r428;
+ setp.ge.u32 %p58, %r8, %r426;
@%p58 bra $L__BB0_78;
- mad.lo.s32 %r294, %r428, %r2, %r39;
- mul.wide.s32 %rd98, %r294, 4;
+ mad.lo.s32 %r293, %r426, %r3, %r39;
+ mul.wide.s32 %rd98, %r293, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r428, 1;
- setp.gt.u32 %p59, %r428, 3;
- mov.u32 %r428, %r51;
+ shr.u32 %r51, %r426, 1;
+ setp.gt.u32 %p59, %r426, 3;
+ mov.u32 %r426, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r429, 0;
+ mov.u32 %r427, 0;
@%p48 bra $L__BB0_83;
- setp.lt.u32 %p61, %r3, 2;
+ setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
- mov.b32 %r429, %f372;
+ mov.b32 %r427, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@@ -724,45 +723,45 @@
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
- mov.u32 %r430, %r438;
+ mov.u32 %r428, %r436;
$L__BB0_87:
- setp.ge.u32 %p64, %r7, %r430;
+ setp.ge.u32 %p64, %r8, %r428;
@%p64 bra $L__BB0_89;
- mad.lo.s32 %r296, %r430, %r2, %r39;
- mul.wide.s32 %rd101, %r296, 4;
+ mad.lo.s32 %r295, %r428, %r3, %r39;
+ mul.wide.s32 %rd101, %r295, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r430, 1;
- setp.gt.u32 %p65, %r430, 3;
- mov.u32 %r430, %r55;
+ shr.u32 %r55, %r428, 1;
+ setp.gt.u32 %p65, %r428, 3;
+ mov.u32 %r428, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r431, 0;
+ mov.u32 %r429, 0;
@%p48 bra $L__BB0_94;
- setp.lt.u32 %p67, %r3, 2;
+ setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
- mov.b32 %r431, %f373;
+ mov.b32 %r429, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@@ -775,45 +774,45 @@
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
- mov.u32 %r432, %r438;
+ mov.u32 %r430, %r436;
$L__BB0_98:
- setp.ge.u32 %p70, %r7, %r432;
+ setp.ge.u32 %p70, %r8, %r430;
@%p70 bra $L__BB0_100;
- mad.lo.s32 %r298, %r432, %r2, %r39;
- mul.wide.s32 %rd104, %r298, 4;
+ mad.lo.s32 %r297, %r430, %r3, %r39;
+ mul.wide.s32 %rd104, %r297, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r432, 1;
- setp.gt.u32 %p71, %r432, 3;
- mov.u32 %r432, %r59;
+ shr.u32 %r59, %r430, 1;
+ setp.gt.u32 %p71, %r430, 3;
+ mov.u32 %r430, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r433, 0;
+ mov.u32 %r431, 0;
@%p48 bra $L__BB0_105;
- setp.lt.u32 %p73, %r3, 2;
+ setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
- mov.b32 %r433, %f374;
+ mov.b32 %r431, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@@ -826,45 +825,45 @@
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
- mov.u32 %r434, %r438;
+ mov.u32 %r432, %r436;
$L__BB0_109:
- setp.ge.u32 %p76, %r7, %r434;
+ setp.ge.u32 %p76, %r8, %r432;
@%p76 bra $L__BB0_111;
- mad.lo.s32 %r300, %r434, %r2, %r39;
- mul.wide.s32 %rd107, %r300, 4;
+ mad.lo.s32 %r299, %r432, %r3, %r39;
+ mul.wide.s32 %rd107, %r299, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r434, 1;
- setp.gt.u32 %p77, %r434, 3;
- mov.u32 %r434, %r63;
+ shr.u32 %r63, %r432, 1;
+ setp.gt.u32 %p77, %r432, 3;
+ mov.u32 %r432, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r435, 0;
+ mov.u32 %r433, 0;
@%p48 bra $L__BB0_116;
- setp.lt.u32 %p79, %r3, 2;
+ setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
- mov.b32 %r435, %f375;
+ mov.b32 %r433, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@@ -877,45 +876,45 @@
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
- mov.u32 %r436, %r438;
+ mov.u32 %r434, %r436;
$L__BB0_120:
- setp.ge.u32 %p82, %r7, %r436;
+ setp.ge.u32 %p82, %r8, %r434;
@%p82 bra $L__BB0_122;
- mad.lo.s32 %r302, %r436, %r2, %r39;
- mul.wide.s32 %rd110, %r302, 4;
+ mad.lo.s32 %r301, %r434, %r3, %r39;
+ mul.wide.s32 %rd110, %r301, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r436, 1;
- setp.gt.u32 %p83, %r436, 3;
- mov.u32 %r436, %r67;
+ shr.u32 %r67, %r434, 1;
+ setp.gt.u32 %p83, %r434, 3;
+ mov.u32 %r434, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r437, 0;
+ mov.u32 %r435, 0;
@%p48 bra $L__BB0_127;
- setp.lt.u32 %p85, %r3, 2;
+ setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
- mov.b32 %r437, %f376;
+ mov.b32 %r435, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@@ -929,185 +928,184 @@
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p88, %r7, %r438;
+ setp.ge.u32 %p88, %r8, %r436;
@%p88 bra $L__BB0_132;
- mad.lo.s32 %r304, %r438, %r2, %r39;
- mul.wide.s32 %rd113, %r304, 4;
+ mad.lo.s32 %r303, %r436, %r3, %r39;
+ mul.wide.s32 %rd113, %r303, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r438, 1;
- setp.gt.u32 %p89, %r438, 3;
- mov.u32 %r438, %r71;
+ shr.u32 %r71, %r436, 1;
+ setp.gt.u32 %p89, %r436, 3;
+ mov.u32 %r436, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r439, 0;
+ mov.u32 %r437, 0;
@%p48 bra $L__BB0_137;
- setp.lt.u32 %p91, %r3, 2;
+ setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
- mov.b32 %r439, %f377;
+ mov.b32 %r437, %f377;
$L__BB0_137:
- setp.eq.s32 %p141, %r7, 0;
+ setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r414, %r5, 2;
- mov.u32 %r314, %ctaid.y;
- mad.lo.s32 %r315, %r161, %r314, %r414;
- mul.wide.s32 %rd118, %r315, 4;
+ mov.u32 %r313, %ctaid.y;
+ mad.lo.s32 %r314, %r161, %r313, %r7;
+ mul.wide.s32 %rd118, %r314, 4;
add.s64 %rd116, %rd40, %rd118;
- st.volatile.global.v4.s32 [%rd116], {%r425,%r427,%r429,%r431};
+ st.volatile.global.v4.s32 [%rd116], {%r423,%r425,%r427,%r429};
add.s64 %rd117, %rd41, %rd118;
- st.volatile.global.v4.s32 [%rd117], {%r433,%r435,%r437,%r439};
+ st.volatile.global.v4.s32 [%rd117], {%r431,%r433,%r435,%r437};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r316, %r5, %r7;
- or.b32 %r318, %r316, %r278;
- setp.ne.s32 %p92, %r318, 0;
+ or.b32 %r315, %r6, %r8;
+ or.b32 %r317, %r315, %r277;
+ setp.ne.s32 %p92, %r317, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
- mov.u32 %r319, %ctaid.x;
- mov.u32 %r320, %ctaid.z;
- mov.u32 %r321, %nctaid.x;
- mad.lo.s32 %r322, %r320, %r321, %r319;
- mul.wide.s32 %rd120, %r322, 8;
+ mov.u32 %r318, %ctaid.x;
+ mov.u32 %r319, %ctaid.z;
+ mov.u32 %r320, %nctaid.x;
+ mad.lo.s32 %r321, %r319, %r320, %r318;
+ mul.wide.s32 %rd120, %r321, 8;
add.s64 %rd28, %rd119, %rd120;
- add.s32 %r323, %r8, -1;
- setp.eq.s32 %p93, %r74, %r323;
- cvt.s64.s32 %rd121, %r8;
+ add.s32 %r322, %r9, -1;
+ setp.eq.s32 %p93, %r74, %r322;
+ cvt.s64.s32 %rd121, %r9;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
- mov.u32 %r440, 8;
+ mov.u32 %r438, 8;
$L__BB0_142:
- nanosleep.u32 %r440;
-
- setp.lt.u32 %p95, %r440, 256;
- selp.u32 %r326, 1, 0, %p95;
- shl.b32 %r440, %r440, %r326;
+ nanosleep.u32 %r438;
+
+ setp.lt.u32 %p95, %r438, 256;
+ selp.u32 %r325, 1, 0, %p95;
+ shl.b32 %r438, %r438, %r325;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- add.s32 %r327, %r8, %r2;
- add.s32 %r328, %r327, -1;
- div.s32 %r77, %r328, %r2;
+ add.s32 %r326, %r9, %r3;
+ add.s32 %r327, %r326, -1;
+ div.s32 %r77, %r327, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
- add.s32 %r330, %r161, 1;
- shr.u32 %r331, %r330, 31;
- add.s32 %r332, %r330, %r331;
- shr.s32 %r333, %r332, 1;
- add.s32 %r334, %r3, %r333;
- add.s32 %r335, %r334, -1;
- shl.b32 %r336, %r7, 1;
- shl.b32 %r337, %r3, 1;
- mad.lo.s32 %r338, %r337, %r74, %r336;
- or.b32 %r339, %r338, 1;
- setp.ge.s32 %p98, %r339, %r161;
- div.s32 %r340, %r335, %r3;
- setp.ge.s32 %p99, %r74, %r340;
+ add.s32 %r329, %r161, 1;
+ shr.u32 %r330, %r329, 31;
+ add.s32 %r331, %r329, %r330;
+ shr.s32 %r332, %r331, 1;
+ add.s32 %r333, %r4, %r332;
+ add.s32 %r334, %r333, -1;
+ shl.b32 %r335, %r8, 1;
+ shl.b32 %r336, %r4, 1;
+ mad.lo.s32 %r337, %r336, %r74, %r335;
+ or.b32 %r338, %r337, 1;
+ setp.ge.s32 %p98, %r338, %r161;
+ div.s32 %r339, %r334, %r4;
+ setp.ge.s32 %p99, %r74, %r339;
or.pred %p6, %p99, %p98;
- mul.lo.s32 %r341, %r3, %r74;
- shl.b32 %r342, %r341, 1;
- mad.lo.s32 %r343, %r161, %r5, %r342;
- add.s32 %r442, %r343, %r336;
- mul.lo.s32 %r79, %r161, %r2;
- mov.u32 %r329, 0;
+ mul.lo.s32 %r340, %r4, %r74;
+ shl.b32 %r341, %r340, 1;
+ mad.lo.s32 %r342, %r161, %r6, %r341;
+ add.s32 %r440, %r342, %r335;
+ mul.lo.s32 %r79, %r161, %r3;
+ mov.u32 %r328, 0;
mov.f32 %f380, 0f00000000;
- mov.u32 %r441, %r5;
- mov.u32 %r443, %r329;
+ mov.u32 %r439, %r6;
+ mov.u32 %r441, %r328;
$L__BB0_145:
.pragma "nounroll";
- mov.u32 %r444, %r329;
- mov.u32 %r445, %r329;
+ mov.u32 %r442, %r328;
+ mov.u32 %r443, %r328;
@%p6 bra $L__BB0_148;
- setp.ge.s32 %p100, %r441, %r8;
- mov.u32 %r444, %r329;
- mov.u32 %r445, %r329;
+ setp.ge.s32 %p100, %r439, %r9;
+ mov.u32 %r442, %r328;
+ mov.u32 %r443, %r328;
@%p100 bra $L__BB0_148;
- mul.wide.s32 %rd130, %r442, 4;
+ mul.wide.s32 %rd130, %r440, 4;
add.s64 %rd129, %rd41, %rd130;
- ld.volatile.global.v2.s32 {%r445,%r444}, [%rd129];
+ ld.volatile.global.v2.s32 {%r443,%r442}, [%rd129];
$L__BB0_148:
- mov.b32 %f304, %r445;
+ mov.b32 %f304, %r443;
add.f32 %f380, %f380, %f304;
- mov.b32 %f305, %r444;
+ mov.b32 %f305, %r442;
add.f32 %f381, %f381, %f305;
- add.s32 %r442, %r442, %r79;
- add.s32 %r441, %r441, %r2;
- add.s32 %r443, %r443, 1;
- setp.lt.s32 %p101, %r443, %r77;
+ add.s32 %r440, %r440, %r79;
+ add.s32 %r439, %r439, %r3;
+ add.s32 %r441, %r441, 1;
+ setp.lt.s32 %p101, %r441, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
- clz.b32 %r350, %r2;
- mov.u32 %r351, 31;
- sub.s32 %r352, %r351, %r350;
- mov.u32 %r353, 1;
- shl.b32 %r90, %r353, %r352;
- setp.lt.u32 %p102, %r5, %r90;
- add.s32 %r354, %r90, %r5;
- setp.lt.u32 %p103, %r354, %r2;
+ clz.b32 %r349, %r3;
+ mov.u32 %r350, 31;
+ sub.s32 %r351, %r350, %r349;
+ mov.u32 %r352, 1;
+ shl.b32 %r90, %r352, %r351;
+ setp.lt.u32 %p102, %r6, %r90;
+ add.s32 %r353, %r90, %r6;
+ setp.lt.u32 %p103, %r353, %r3;
and.pred %p7, %p102, %p103;
- add.s32 %r355, %r39, %r90;
- mul.wide.s32 %rd131, %r355, 4;
+ add.s32 %r354, %r39, %r90;
+ mul.wide.s32 %rd131, %r354, 4;
add.s64 %rd30, %rd43, %rd131;
- shr.u32 %r356, %r90, 31;
- add.s32 %r357, %r90, %r356;
- shr.s32 %r457, %r357, 1;
+ shr.u32 %r355, %r90, 31;
+ add.s32 %r356, %r90, %r355;
+ shr.s32 %r455, %r356, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
@@ -1119,49 +1117,49 @@
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
- mov.u32 %r446, %r457;
+ mov.u32 %r444, %r455;
$L__BB0_153:
- setp.ge.u32 %p106, %r5, %r446;
+ setp.ge.u32 %p106, %r6, %r444;
@%p106 bra $L__BB0_155;
- add.s32 %r358, %r446, %r39;
- mul.wide.s32 %rd133, %r358, 4;
+ add.s32 %r357, %r444, %r39;
+ mul.wide.s32 %rd133, %r357, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
- shr.u32 %r93, %r446, 1;
- setp.gt.u32 %p107, %r446, 3;
- mov.u32 %r446, %r93;
+ shr.u32 %r93, %r444, 1;
+ setp.gt.u32 %p107, %r444, 3;
+ mov.u32 %r444, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
- add.s32 %r360, %r39, 1;
- mul.wide.u32 %rd136, %r360, 4;
+ add.s32 %r359, %r39, 1;
+ mul.wide.u32 %rd136, %r359, 4;
add.s64 %rd31, %rd43, %rd136;
- setp.ne.s32 %p108, %r5, 0;
- mov.u32 %r447, 0;
+ setp.ne.s32 %p108, %r6, 0;
+ mov.u32 %r445, 0;
@%p108 bra $L__BB0_160;
- setp.lt.u32 %p109, %r2, 2;
+ setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
- mov.b32 %r447, %f382;
+ mov.b32 %r445, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@@ -1174,131 +1172,131 @@
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
- mov.u32 %r448, %r457;
+ mov.u32 %r446, %r455;
$L__BB0_164:
- setp.ge.u32 %p112, %r5, %r448;
+ setp.ge.u32 %p112, %r6, %r446;
@%p112 bra $L__BB0_166;
- add.s32 %r361, %r448, %r39;
- mul.wide.s32 %rd138, %r361, 4;
+ add.s32 %r360, %r446, %r39;
+ mul.wide.s32 %rd138, %r360, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
- shr.u32 %r97, %r448, 1;
- setp.gt.u32 %p113, %r448, 3;
- mov.u32 %r448, %r97;
+ shr.u32 %r97, %r446, 1;
+ setp.gt.u32 %p113, %r446, 3;
+ mov.u32 %r446, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
- mov.u32 %r449, 0;
+ mov.u32 %r447, 0;
@%p108 bra $L__BB0_171;
- setp.lt.u32 %p115, %r2, 2;
+ setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
- mov.b32 %r449, %f383;
+ mov.b32 %r447, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
- add.s32 %r363, %r161, 1;
- shr.u32 %r364, %r363, 31;
- add.s32 %r365, %r363, %r364;
- shr.s32 %r366, %r365, 1;
- add.s32 %r367, %r3, %r366;
- add.s32 %r368, %r367, -1;
- div.s32 %r369, %r368, %r3;
- setp.ge.s32 %p117, %r74, %r369;
+ add.s32 %r362, %r161, 1;
+ shr.u32 %r363, %r362, 31;
+ add.s32 %r364, %r362, %r363;
+ shr.s32 %r365, %r364, 1;
+ add.s32 %r366, %r4, %r365;
+ add.s32 %r367, %r366, -1;
+ div.s32 %r368, %r367, %r4;
+ setp.ge.s32 %p117, %r74, %r368;
@%p117 bra $L__BB0_175;
- shl.b32 %r100, %r7, 1;
- mul.lo.s32 %r370, %r3, %r74;
- shl.b32 %r101, %r370, 1;
- add.s32 %r371, %r100, %r101;
- or.b32 %r372, %r371, 1;
- setp.ge.s32 %p118, %r372, %r161;
+ shl.b32 %r100, %r8, 1;
+ mul.lo.s32 %r369, %r4, %r74;
+ shl.b32 %r101, %r369, 1;
+ add.s32 %r370, %r100, %r101;
+ or.b32 %r371, %r370, 1;
+ setp.ge.s32 %p118, %r371, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r375, %r101, %r100;
- mul.wide.s32 %rd142, %r375, 4;
+ add.s32 %r374, %r101, %r100;
+ mul.wide.s32 %rd142, %r374, 4;
add.s64 %rd141, %rd154, %rd142;
- st.global.cs.v2.s32 [%rd141], {%r447,%r449};
+ st.global.cs.v2.s32 [%rd141], {%r445,%r447};
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
- add.s32 %r377, %r161, 1;
- shr.u32 %r378, %r377, 31;
- add.s32 %r379, %r377, %r378;
- shr.s32 %r380, %r379, 1;
- add.s32 %r381, %r3, %r380;
- add.s32 %r382, %r381, -1;
- shl.b32 %r383, %r7, 1;
- shl.b32 %r384, %r3, 1;
- mad.lo.s32 %r385, %r384, %r74, %r383;
- or.b32 %r386, %r385, 1;
- setp.ge.s32 %p120, %r386, %r161;
- div.s32 %r387, %r382, %r3;
- setp.ge.s32 %p121, %r74, %r387;
+ add.s32 %r376, %r161, 1;
+ shr.u32 %r377, %r376, 31;
+ add.s32 %r378, %r376, %r377;
+ shr.s32 %r379, %r378, 1;
+ add.s32 %r380, %r4, %r379;
+ add.s32 %r381, %r380, -1;
+ shl.b32 %r382, %r8, 1;
+ shl.b32 %r383, %r4, 1;
+ mad.lo.s32 %r384, %r383, %r74, %r382;
+ or.b32 %r385, %r384, 1;
+ setp.ge.s32 %p120, %r385, %r161;
+ div.s32 %r386, %r381, %r4;
+ setp.ge.s32 %p121, %r74, %r386;
or.pred %p8, %p121, %p120;
- mul.lo.s32 %r388, %r3, %r74;
- shl.b32 %r389, %r388, 1;
- mad.lo.s32 %r390, %r161, %r5, %r389;
- add.s32 %r451, %r390, %r383;
- mul.lo.s32 %r103, %r161, %r2;
- mov.u32 %r376, 0;
+ mul.lo.s32 %r387, %r4, %r74;
+ shl.b32 %r388, %r387, 1;
+ mad.lo.s32 %r389, %r161, %r6, %r388;
+ add.s32 %r449, %r389, %r382;
+ mul.lo.s32 %r103, %r161, %r3;
+ mov.u32 %r375, 0;
mov.f32 %f386, 0f00000000;
- mov.u32 %r450, %r5;
- mov.u32 %r452, %r376;
+ mov.u32 %r448, %r6;
+ mov.u32 %r450, %r375;
$L__BB0_177:
.pragma "nounroll";
- mov.u32 %r453, %r376;
- mov.u32 %r454, %r376;
+ mov.u32 %r451, %r375;
+ mov.u32 %r452, %r375;
@%p8 bra $L__BB0_180;
- setp.ge.s32 %p122, %r450, %r8;
- mov.u32 %r453, %r376;
- mov.u32 %r454, %r376;
+ setp.ge.s32 %p122, %r448, %r9;
+ mov.u32 %r451, %r375;
+ mov.u32 %r452, %r375;
@%p122 bra $L__BB0_180;
- mul.wide.s32 %rd144, %r451, 4;
+ mul.wide.s32 %rd144, %r449, 4;
add.s64 %rd143, %rd40, %rd144;
- ld.volatile.global.v2.s32 {%r454,%r453}, [%rd143];
+ ld.volatile.global.v2.s32 {%r452,%r451}, [%rd143];
$L__BB0_180:
- mov.b32 %f326, %r454;
+ mov.b32 %f326, %r452;
add.f32 %f386, %f386, %f326;
- mov.b32 %f327, %r453;
+ mov.b32 %f327, %r451;
add.f32 %f387, %f387, %f327;
- add.s32 %r451, %r451, %r103;
- add.s32 %r450, %r450, %r2;
- add.s32 %r452, %r452, 1;
- setp.lt.s32 %p123, %r452, %r77;
+ add.s32 %r449, %r449, %r103;
+ add.s32 %r448, %r448, %r3;
+ add.s32 %r450, %r450, 1;
+ setp.lt.s32 %p123, %r450, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@@ -1311,45 +1309,45 @@
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
- mov.u32 %r455, %r457;
+ mov.u32 %r453, %r455;
$L__BB0_185:
- setp.ge.u32 %p126, %r5, %r455;
+ setp.ge.u32 %p126, %r6, %r453;
@%p126 bra $L__BB0_187;
- add.s32 %r397, %r455, %r39;
- mul.wide.s32 %rd145, %r397, 4;
+ add.s32 %r396, %r453, %r39;
+ mul.wide.s32 %rd145, %r396, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
- shr.u32 %r115, %r455, 1;
- setp.gt.u32 %p127, %r455, 3;
- mov.u32 %r455, %r115;
+ shr.u32 %r115, %r453, 1;
+ setp.gt.u32 %p127, %r453, 3;
+ mov.u32 %r453, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
- mov.u32 %r456, 0;
+ mov.u32 %r454, 0;
@%p108 bra $L__BB0_192;
- setp.lt.u32 %p129, %r2, 2;
+ setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
- mov.b32 %r456, %f388;
+ mov.b32 %r454, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@@ -1363,71 +1361,71 @@
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
- setp.ge.u32 %p132, %r5, %r457;
+ setp.ge.u32 %p132, %r6, %r455;
@%p132 bra $L__BB0_197;
- add.s32 %r399, %r457, %r39;
- mul.wide.s32 %rd148, %r399, 4;
+ add.s32 %r398, %r455, %r39;
+ mul.wide.s32 %rd148, %r398, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
- shr.u32 %r119, %r457, 1;
- setp.gt.u32 %p133, %r457, 3;
- mov.u32 %r457, %r119;
+ shr.u32 %r119, %r455, 1;
+ setp.gt.u32 %p133, %r455, 3;
+ mov.u32 %r455, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
- mov.u32 %r458, 0;
+ mov.u32 %r456, 0;
@%p108 bra $L__BB0_202;
- setp.lt.u32 %p135, %r2, 2;
+ setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
- mov.b32 %r458, %f389;
+ mov.b32 %r456, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
- add.s32 %r401, %r161, 1;
- shr.u32 %r402, %r401, 31;
- add.s32 %r403, %r401, %r402;
- shr.s32 %r404, %r403, 1;
- add.s32 %r405, %r3, %r404;
- add.s32 %r406, %r405, -1;
- div.s32 %r407, %r406, %r3;
- setp.ge.s32 %p137, %r74, %r407;
+ add.s32 %r400, %r161, 1;
+ shr.u32 %r401, %r400, 31;
+ add.s32 %r402, %r400, %r401;
+ shr.s32 %r403, %r402, 1;
+ add.s32 %r404, %r4, %r403;
+ add.s32 %r405, %r404, -1;
+ div.s32 %r406, %r405, %r4;
+ setp.ge.s32 %p137, %r74, %r406;
@%p137 bra $L__BB0_206;
- shl.b32 %r122, %r7, 1;
- mul.lo.s32 %r408, %r3, %r74;
- shl.b32 %r123, %r408, 1;
- add.s32 %r409, %r122, %r123;
- or.b32 %r410, %r409, 1;
- setp.ge.s32 %p138, %r410, %r161;
+ shl.b32 %r122, %r8, 1;
+ mul.lo.s32 %r407, %r4, %r74;
+ shl.b32 %r123, %r407, 1;
+ add.s32 %r408, %r122, %r123;
+ or.b32 %r409, %r408, 1;
+ setp.ge.s32 %p138, %r409, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r413, %r123, %r122;
- mul.wide.s32 %rd152, %r413, 4;
+ add.s32 %r412, %r123, %r122;
+ mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd151, %rd153, %rd152;
- st.global.cs.v2.s32 [%rd151], {%r456,%r458};
+ st.global.cs.v2.s32 [%rd151], {%r454,%r456};
$L__BB0_206:
ret;
8: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_96
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T56[i12]
= T56[i12]
+ T55[i12];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T37[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T37[i14], T56[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T37[0]);
}
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T51[i15]
= T51[i15]
+ T50[i15];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T36[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T36[i17], T51[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T36[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
.reg .b32 %r<459>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r170, %r171}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
shr.s32 %r200, %r199, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r201, %r200, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r202, %r3, 2;
mad.lo.s32 %r203, %r202, %r201, 15;
and.b32 %r204, %r203, -16;
cvt.u64.u32 %rd1, %r204;
mul.lo.s32 %r205, %r3, %r200;
shl.b32 %r206, %r205, 4;
or.b32 %r207, %r206, 15;
and.b32 %r4, %r207, -16;
add.s32 %r208, %r207, %r4;
and.b32 %r209, %r208, -16;
cvt.s64.s32 %rd2, %r209;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p9, %r5, %r200;
shl.b32 %r6, %r5, 2;
or.b32 %r210, %r6, 3;
setp.lt.s32 %p10, %r210, %r161;
and.pred %p1, %p10, %p9;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p11, %r7, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r211, smem_ptr; }
// end inline asm
shl.b32 %r214, %r5, 4;
add.s32 %r212, %r211, %r214;
mul.wide.s32 %rd47, %r6, 4;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r213, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r213, 0;
cp.async.ca.shared.global [%r212], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r215, %r3, 215;
div.s32 %r216, %r215, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r217, %r8, %r216;
add.s32 %r218, %r217, -1;
div.s32 %r9, %r218, %r8;
setp.gt.s32 %p13, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
cvt.s64.s32 %rd48, %r4;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r220, %ctaid.y;
mul.lo.s32 %r221, %r9, %r3;
mul.lo.s32 %r10, %r221, %r220;
shl.b32 %r222, %r7, 2;
shl.b32 %r223, %r5, 4;
mad.lo.s32 %r11, %r222, %r161, %r223;
mul.lo.s32 %r224, %r161, %r7;
cvt.s64.s32 %rd52, %r224;
cvt.s64.s32 %rd53, %r6;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r225, %r10, %r161;
cvt.s64.s32 %rd6, %r225;
mul.lo.s32 %r12, %r161, %r3;
mul.lo.s32 %r13, %r9, %r220;
add.s32 %r14, %r224, %r6;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r14, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
mad.lo.s32 %r227, %r3, %r226, %r7;
mad.lo.s32 %r15, %r227, %r2, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r228, %r2;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
setp.lt.u32 %p14, %r5, %r16;
add.s32 %r232, %r16, %r5;
setp.lt.u32 %p15, %r232, %r2;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
add.s32 %r235, %r16, %r234;
shr.s32 %r17, %r235, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r6, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
mov.u32 %r417, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
// end inline asm
add.s32 %r240, %r11, %r239;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
// end inline asm
add.s32 %r266, %r11, %r265;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r237, %r417, %r3, %r7;
add.s32 %r238, %r237, %r10;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r242, %r12, %r417;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r241, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r241, 0;
cp.async.ca.shared.global [%r240], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r243, %r13, %r417;
mad.lo.s32 %r244, %r243, %r3, %r7;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r253, %r13, %r417;
mad.lo.s32 %r254, %r253, %r3, %r7;
setp.gt.s32 %p20, %r254, 215;
mov.u32 %r418, 0;
mov.u32 %r419, %r418;
mov.u32 %r420, %r418;
mov.u32 %r421, %r418;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r418, 0;
mov.u32 %r419, %r418;
mov.u32 %r420, %r418;
mov.u32 %r421, %r418;
$L__BB0_15:
add.s32 %r263, %r13, %r417;
mad.lo.s32 %r33, %r263, %r3, %r7;
mov.b32 %f112, %r421;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r420;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r419;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r418;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r264, %r33, %r170;
mul.wide.s32 %rd69, %r264, 4;
add.s64 %rd70, %rd16, %rd69;
ld.global.f32 %f352, [%rd70];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r268, %r12, %r417;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r267, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r267, 0;
cp.async.ca.shared.global [%r266], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r416, %r13, %r417;
mad.lo.s32 %r415, %r416, %r3, %r7;
setp.gt.s32 %p142, %r415, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
mul.wide.s32 %rd77, %r269, 4;
add.s64 %rd78, %rd17, %rd77;
ld.global.f32 %f353, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r422, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r5, %r422;
@%p28 bra $L__BB0_29;
add.s32 %r270, %r422, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r422, 1;
setp.gt.u32 %p29, %r422, 3;
mov.u32 %r422, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r5, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r2, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r423, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r5, %r423;
@%p34 bra $L__BB0_39;
add.s32 %r271, %r423, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r423, 1;
setp.gt.u32 %p35, %r423, 3;
mov.u32 %r423, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r2, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r272, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r273, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r274, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
mad.lo.s32 %r276, %r417, %r3, %r10;
mad.lo.s32 %r277, %r276, %r161, %r14;
mul.wide.s32 %rd86, %r277, 4;
add.s64 %rd85, %rd37, %rd86;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
// end inline asm
$L__BB0_49:
add.s32 %r417, %r417, 1;
setp.lt.s32 %p41, %r417, %r9;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r278, %tid.z;
mad.lo.s32 %r279, %r3, %r278, %r7;
mad.lo.s32 %r39, %r279, %r2, %r5;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
clz.b32 %r280, %r3;
mov.u32 %r281, 31;
sub.s32 %r282, %r281, %r280;
mov.u32 %r283, 1;
shl.b32 %r40, %r283, %r282;
setp.lt.u32 %p42, %r7, %r40;
add.s32 %r284, %r40, %r7;
setp.lt.u32 %p43, %r284, %r3;
and.pred %p5, %p42, %p43;
shl.b32 %r285, %r2, %r282;
add.s32 %r286, %r39, %r285;
mul.wide.s32 %rd89, %r286, 4;
add.s64 %rd24, %rd43, %rd89;
shr.u32 %r287, %r40, 31;
add.s32 %r288, %r40, %r287;
shr.s32 %r438, %r288, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r424, %r438;
$L__BB0_54:
setp.ge.u32 %p46, %r7, %r424;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r289, %r424, %r2, %r39;
mul.wide.s32 %rd90, %r289, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r424, 1;
setp.gt.u32 %p47, %r424, 3;
mov.u32 %r424, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r291, %r39, %r2;
mul.wide.u32 %rd93, %r291, 4;
add.s64 %rd25, %rd43, %rd93;
setp.ne.s32 %p48, %r7, 0;
mov.u32 %r425, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r3, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r425, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r426, %r438;
$L__BB0_65:
setp.ge.u32 %p52, %r7, %r426;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r292, %r426, %r2, %r39;
mul.wide.s32 %rd95, %r292, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r426, 1;
setp.gt.u32 %p53, %r426, 3;
mov.u32 %r426, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r427, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r3, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r427, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r428, %r438;
$L__BB0_76:
setp.ge.u32 %p58, %r7, %r428;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r294, %r428, %r2, %r39;
mul.wide.s32 %rd98, %r294, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r428, 1;
setp.gt.u32 %p59, %r428, 3;
mov.u32 %r428, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r429, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r3, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r429, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r430, %r438;
$L__BB0_87:
setp.ge.u32 %p64, %r7, %r430;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r296, %r430, %r2, %r39;
mul.wide.s32 %rd101, %r296, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r430, 1;
setp.gt.u32 %p65, %r430, 3;
mov.u32 %r430, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r431, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r3, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r431, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r432, %r438;
$L__BB0_98:
setp.ge.u32 %p70, %r7, %r432;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r298, %r432, %r2, %r39;
mul.wide.s32 %rd104, %r298, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r432, 1;
setp.gt.u32 %p71, %r432, 3;
mov.u32 %r432, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r433, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r3, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r433, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r434, %r438;
$L__BB0_109:
setp.ge.u32 %p76, %r7, %r434;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r300, %r434, %r2, %r39;
mul.wide.s32 %rd107, %r300, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r434, 1;
setp.gt.u32 %p77, %r434, 3;
mov.u32 %r434, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r435, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r3, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r435, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r436, %r438;
$L__BB0_120:
setp.ge.u32 %p82, %r7, %r436;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r302, %r436, %r2, %r39;
mul.wide.s32 %rd110, %r302, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r436, 1;
setp.gt.u32 %p83, %r436, 3;
mov.u32 %r436, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r437, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r437, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r7, %r438;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r304, %r438, %r2, %r39;
mul.wide.s32 %rd113, %r304, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r438, 1;
setp.gt.u32 %p89, %r438, 3;
mov.u32 %r438, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r439, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r439, %f377;
$L__BB0_137:
setp.eq.s32 %p141, %r7, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r414, %r5, 2;
mov.u32 %r314, %ctaid.y;
mad.lo.s32 %r315, %r161, %r314, %r414;
mul.wide.s32 %rd118, %r315, 4;
add.s64 %rd116, %rd40, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd116], {%r425,%r427,%r429,%r431};
// end inline asm
add.s64 %rd117, %rd41, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd117], {%r433,%r435,%r437,%r439};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r316, %r5, %r7;
or.b32 %r318, %r316, %r278;
setp.ne.s32 %p92, %r318, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
mov.u32 %r319, %ctaid.x;
mov.u32 %r320, %ctaid.z;
mov.u32 %r321, %nctaid.x;
mad.lo.s32 %r322, %r320, %r321, %r319;
mul.wide.s32 %rd120, %r322, 8;
add.s64 %rd28, %rd119, %rd120;
add.s32 %r323, %r8, -1;
setp.eq.s32 %p93, %r74, %r323;
cvt.s64.s32 %rd121, %r8;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r440, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r440;
// end inline asm
setp.lt.u32 %p95, %r440, 256;
selp.u32 %r326, 1, 0, %p95;
shl.b32 %r440, %r440, %r326;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r327, %r8, %r2;
add.s32 %r328, %r327, -1;
div.s32 %r77, %r328, %r2;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
add.s32 %r330, %r161, 1;
shr.u32 %r331, %r330, 31;
add.s32 %r332, %r330, %r331;
shr.s32 %r333, %r332, 1;
add.s32 %r334, %r3, %r333;
add.s32 %r335, %r334, -1;
shl.b32 %r336, %r7, 1;
shl.b32 %r337, %r3, 1;
mad.lo.s32 %r338, %r337, %r74, %r336;
or.b32 %r339, %r338, 1;
setp.ge.s32 %p98, %r339, %r161;
div.s32 %r340, %r335, %r3;
setp.ge.s32 %p99, %r74, %r340;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r341, %r3, %r74;
shl.b32 %r342, %r341, 1;
mad.lo.s32 %r343, %r161, %r5, %r342;
add.s32 %r442, %r343, %r336;
mul.lo.s32 %r79, %r161, %r2;
mov.u32 %r329, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r441, %r5;
mov.u32 %r443, %r329;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r444, %r329;
mov.u32 %r445, %r329;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r441, %r8;
mov.u32 %r444, %r329;
mov.u32 %r445, %r329;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd130, %r442, 4;
add.s64 %rd129, %rd41, %rd130;
// begin inline asm
ld.volatile.global.v2.s32 {%r445,%r444}, [%rd129];
// end inline asm
$L__BB0_148:
mov.b32 %f304, %r445;
add.f32 %f380, %f380, %f304;
mov.b32 %f305, %r444;
add.f32 %f381, %f381, %f305;
add.s32 %r442, %r442, %r79;
add.s32 %r441, %r441, %r2;
add.s32 %r443, %r443, 1;
setp.lt.s32 %p101, %r443, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r350, %r2;
mov.u32 %r351, 31;
sub.s32 %r352, %r351, %r350;
mov.u32 %r353, 1;
shl.b32 %r90, %r353, %r352;
setp.lt.u32 %p102, %r5, %r90;
add.s32 %r354, %r90, %r5;
setp.lt.u32 %p103, %r354, %r2;
and.pred %p7, %p102, %p103;
add.s32 %r355, %r39, %r90;
mul.wide.s32 %rd131, %r355, 4;
add.s64 %rd30, %rd43, %rd131;
shr.u32 %r356, %r90, 31;
add.s32 %r357, %r90, %r356;
shr.s32 %r457, %r357, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r446, %r457;
$L__BB0_153:
setp.ge.u32 %p106, %r5, %r446;
@%p106 bra $L__BB0_155;
add.s32 %r358, %r446, %r39;
mul.wide.s32 %rd133, %r358, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r446, 1;
setp.gt.u32 %p107, %r446, 3;
mov.u32 %r446, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r360, %r39, 1;
mul.wide.u32 %rd136, %r360, 4;
add.s64 %rd31, %rd43, %rd136;
setp.ne.s32 %p108, %r5, 0;
mov.u32 %r447, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r2, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
mov.b32 %r447, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r448, %r457;
$L__BB0_164:
setp.ge.u32 %p112, %r5, %r448;
@%p112 bra $L__BB0_166;
add.s32 %r361, %r448, %r39;
mul.wide.s32 %rd138, %r361, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r448, 1;
setp.gt.u32 %p113, %r448, 3;
mov.u32 %r448, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r449, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r2, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
mov.b32 %r449, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r363, %r161, 1;
shr.u32 %r364, %r363, 31;
add.s32 %r365, %r363, %r364;
shr.s32 %r366, %r365, 1;
add.s32 %r367, %r3, %r366;
add.s32 %r368, %r367, -1;
div.s32 %r369, %r368, %r3;
setp.ge.s32 %p117, %r74, %r369;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r7, 1;
mul.lo.s32 %r370, %r3, %r74;
shl.b32 %r101, %r370, 1;
add.s32 %r371, %r100, %r101;
or.b32 %r372, %r371, 1;
setp.ge.s32 %p118, %r372, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r375, %r101, %r100;
mul.wide.s32 %rd142, %r375, 4;
add.s64 %rd141, %rd154, %rd142;
// begin inline asm
st.global.cs.v2.s32 [%rd141], {%r447,%r449};
// end inline asm
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
add.s32 %r377, %r161, 1;
shr.u32 %r378, %r377, 31;
add.s32 %r379, %r377, %r378;
shr.s32 %r380, %r379, 1;
add.s32 %r381, %r3, %r380;
add.s32 %r382, %r381, -1;
shl.b32 %r383, %r7, 1;
shl.b32 %r384, %r3, 1;
mad.lo.s32 %r385, %r384, %r74, %r383;
or.b32 %r386, %r385, 1;
setp.ge.s32 %p120, %r386, %r161;
div.s32 %r387, %r382, %r3;
setp.ge.s32 %p121, %r74, %r387;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r388, %r3, %r74;
shl.b32 %r389, %r388, 1;
mad.lo.s32 %r390, %r161, %r5, %r389;
add.s32 %r451, %r390, %r383;
mul.lo.s32 %r103, %r161, %r2;
mov.u32 %r376, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r450, %r5;
mov.u32 %r452, %r376;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r453, %r376;
mov.u32 %r454, %r376;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r450, %r8;
mov.u32 %r453, %r376;
mov.u32 %r454, %r376;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd144, %r451, 4;
add.s64 %rd143, %rd40, %rd144;
// begin inline asm
ld.volatile.global.v2.s32 {%r454,%r453}, [%rd143];
// end inline asm
$L__BB0_180:
mov.b32 %f326, %r454;
add.f32 %f386, %f386, %f326;
mov.b32 %f327, %r453;
add.f32 %f387, %f387, %f327;
add.s32 %r451, %r451, %r103;
add.s32 %r450, %r450, %r2;
add.s32 %r452, %r452, 1;
setp.lt.s32 %p123, %r452, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r455, %r457;
$L__BB0_185:
setp.ge.u32 %p126, %r5, %r455;
@%p126 bra $L__BB0_187;
add.s32 %r397, %r455, %r39;
mul.wide.s32 %rd145, %r397, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r455, 1;
setp.gt.u32 %p127, %r455, 3;
mov.u32 %r455, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r456, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r2, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
mov.b32 %r456, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r5, %r457;
@%p132 bra $L__BB0_197;
add.s32 %r399, %r457, %r39;
mul.wide.s32 %rd148, %r399, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r457, 1;
setp.gt.u32 %p133, %r457, 3;
mov.u32 %r457, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r458, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r2, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
mov.b32 %r458, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r401, %r161, 1;
shr.u32 %r402, %r401, 31;
add.s32 %r403, %r401, %r402;
shr.s32 %r404, %r403, 1;
add.s32 %r405, %r3, %r404;
add.s32 %r406, %r405, -1;
div.s32 %r407, %r406, %r3;
setp.ge.s32 %p137, %r74, %r407;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r7, 1;
mul.lo.s32 %r408, %r3, %r74;
shl.b32 %r123, %r408, 1;
add.s32 %r409, %r122, %r123;
or.b32 %r410, %r409, 1;
setp.ge.s32 %p138, %r410, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_7b5a152a_1033910nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r413, %r123, %r122;
mul.wide.s32 %rd152, %r413, 4;
add.s64 %rd151, %rd153, %rd152;
// begin inline asm
st.global.cs.v2.s32 [%rd151], {%r456,%r458};
// end inline asm
$L__BB0_206:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
.reg .b32 %r<457>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r170, %r171}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
shr.s32 %r2, %r199, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r200, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r201, %r4, 2;
mad.lo.s32 %r202, %r201, %r200, 15;
and.b32 %r203, %r202, -16;
cvt.u64.u32 %rd1, %r203;
mul.lo.s32 %r204, %r4, %r2;
shl.b32 %r205, %r204, 4;
or.b32 %r206, %r205, 15;
and.b32 %r5, %r206, -16;
add.s32 %r207, %r206, %r5;
and.b32 %r208, %r207, -16;
cvt.s64.s32 %rd2, %r208;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r209, %r7, 3;
setp.lt.s32 %p10, %r209, %r161;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r210, smem_ptr; }
// end inline asm
shl.b32 %r213, %r6, 4;
add.s32 %r211, %r210, %r213;
mul.wide.s32 %rd47, %r7, 4;
add.s64 %rd46, %rd36, %rd47;
mov.u32 %r212, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r212, 0;
cp.async.ca.shared.global [%r211], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r214, %r4, 215;
div.s32 %r215, %r214, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r216, %r9, %r215;
add.s32 %r217, %r216, -1;
div.s32 %r10, %r217, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
mov.u32 %r219, %ctaid.y;
mul.lo.s32 %r220, %r10, %r4;
mul.lo.s32 %r11, %r220, %r219;
mad.lo.s32 %r221, %r2, %r8, %r6;
shl.b32 %r12, %r221, 4;
mul.lo.s32 %r222, %r161, %r8;
cvt.s64.s32 %rd52, %r222;
cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r223, %r11, %r161;
cvt.s64.s32 %rd6, %r223;
mul.lo.s32 %r13, %r161, %r4;
mul.lo.s32 %r14, %r10, %r219;
shl.b32 %r224, %r8, 2;
mad.lo.s32 %r225, %r224, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
mul.wide.s32 %rd55, %r225, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
mad.lo.s32 %r227, %r4, %r226, %r8;
mad.lo.s32 %r15, %r227, %r3, %r6;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
clz.b32 %r228, %r3;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r232, %r16, %r6;
setp.lt.u32 %p15, %r232, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
add.s32 %r235, %r16, %r234;
shr.s32 %r17, %r235, 1;
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
mov.u32 %r415, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
// end inline asm
add.s32 %r240, %r239, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
// end inline asm
add.s32 %r266, %r265, %r12;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r237, %r415, %r4, %r8;
add.s32 %r238, %r237, %r11;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r242, %r13, %r415;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
mov.u32 %r241, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r241, 0;
cp.async.ca.shared.global [%r240], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r243, %r14, %r415;
mad.lo.s32 %r244, %r243, %r4, %r8;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r253, %r14, %r415;
mad.lo.s32 %r254, %r253, %r4, %r8;
setp.gt.s32 %p20, %r254, 215;
mov.u32 %r416, 0;
mov.u32 %r417, %r416;
mov.u32 %r418, %r416;
mov.u32 %r419, %r416;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r416, 0;
mov.u32 %r417, %r416;
mov.u32 %r418, %r416;
mov.u32 %r419, %r416;
$L__BB0_15:
add.s32 %r263, %r14, %r415;
mad.lo.s32 %r33, %r263, %r4, %r8;
mov.b32 %f112, %r419;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r418;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r417;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r416;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r264, %r33, %r170;
mul.wide.s32 %rd69, %r264, 4;
add.s64 %rd70, %rd16, %rd69;
ld.global.f32 %f352, [%rd70];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r268, %r13, %r415;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
mov.u32 %r267, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r267, 0;
cp.async.ca.shared.global [%r266], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r414, %r14, %r415;
mad.lo.s32 %r413, %r414, %r4, %r8;
setp.gt.s32 %p142, %r413, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
mul.wide.s32 %rd77, %r269, 4;
add.s64 %rd78, %rd17, %rd77;
ld.global.f32 %f353, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r420, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r6, %r420;
@%p28 bra $L__BB0_29;
add.s32 %r270, %r420, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r420, 1;
setp.gt.u32 %p29, %r420, 3;
mov.u32 %r420, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r421, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r6, %r421;
@%p34 bra $L__BB0_39;
add.s32 %r271, %r421, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r421, 1;
setp.gt.u32 %p35, %r421, 3;
mov.u32 %r421, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r272, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r273, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r274, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
mad.lo.s32 %r276, %r33, %r161, %r7;
mul.wide.s32 %rd86, %r276, 4;
add.s64 %rd85, %rd37, %rd86;
// begin inline asm
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
// end inline asm
$L__BB0_49:
add.s32 %r415, %r415, 1;
setp.lt.s32 %p41, %r415, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r277, %tid.z;
mad.lo.s32 %r278, %r4, %r277, %r8;
mad.lo.s32 %r39, %r278, %r3, %r6;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
clz.b32 %r279, %r4;
mov.u32 %r280, 31;
sub.s32 %r281, %r280, %r279;
mov.u32 %r282, 1;
shl.b32 %r40, %r282, %r281;
setp.lt.u32 %p42, %r8, %r40;
add.s32 %r283, %r40, %r8;
setp.lt.u32 %p43, %r283, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r284, %r3, %r281;
add.s32 %r285, %r39, %r284;
mul.wide.s32 %rd89, %r285, 4;
add.s64 %rd24, %rd43, %rd89;
shr.u32 %r286, %r40, 31;
add.s32 %r287, %r40, %r286;
shr.s32 %r436, %r287, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r422, %r436;
$L__BB0_54:
setp.ge.u32 %p46, %r8, %r422;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r288, %r422, %r3, %r39;
mul.wide.s32 %rd90, %r288, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r422, 1;
setp.gt.u32 %p47, %r422, 3;
mov.u32 %r422, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r290, %r39, %r3;
mul.wide.u32 %rd93, %r290, 4;
add.s64 %rd25, %rd43, %rd93;
setp.ne.s32 %p48, %r8, 0;
mov.u32 %r423, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r423, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r424, %r436;
$L__BB0_65:
setp.ge.u32 %p52, %r8, %r424;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r291, %r424, %r3, %r39;
mul.wide.s32 %rd95, %r291, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r424, 1;
setp.gt.u32 %p53, %r424, 3;
mov.u32 %r424, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r425, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r425, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r426, %r436;
$L__BB0_76:
setp.ge.u32 %p58, %r8, %r426;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r293, %r426, %r3, %r39;
mul.wide.s32 %rd98, %r293, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r426, 1;
setp.gt.u32 %p59, %r426, 3;
mov.u32 %r426, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r427, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r427, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r428, %r436;
$L__BB0_87:
setp.ge.u32 %p64, %r8, %r428;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r295, %r428, %r3, %r39;
mul.wide.s32 %rd101, %r295, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r428, 1;
setp.gt.u32 %p65, %r428, 3;
mov.u32 %r428, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r429, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r429, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r430, %r436;
$L__BB0_98:
setp.ge.u32 %p70, %r8, %r430;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r297, %r430, %r3, %r39;
mul.wide.s32 %rd104, %r297, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r430, 1;
setp.gt.u32 %p71, %r430, 3;
mov.u32 %r430, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r431, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r431, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r432, %r436;
$L__BB0_109:
setp.ge.u32 %p76, %r8, %r432;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r299, %r432, %r3, %r39;
mul.wide.s32 %rd107, %r299, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r432, 1;
setp.gt.u32 %p77, %r432, 3;
mov.u32 %r432, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r433, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r433, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r434, %r436;
$L__BB0_120:
setp.ge.u32 %p82, %r8, %r434;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r301, %r434, %r3, %r39;
mul.wide.s32 %rd110, %r301, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r434, 1;
setp.gt.u32 %p83, %r434, 3;
mov.u32 %r434, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r435, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r435, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r8, %r436;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r303, %r436, %r3, %r39;
mul.wide.s32 %rd113, %r303, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r436, 1;
setp.gt.u32 %p89, %r436, 3;
mov.u32 %r436, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r437, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r437, %f377;
$L__BB0_137:
setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r313, %ctaid.y;
mad.lo.s32 %r314, %r161, %r313, %r7;
mul.wide.s32 %rd118, %r314, 4;
add.s64 %rd116, %rd40, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd116], {%r423,%r425,%r427,%r429};
// end inline asm
add.s64 %rd117, %rd41, %rd118;
// begin inline asm
st.volatile.global.v4.s32 [%rd117], {%r431,%r433,%r435,%r437};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r315, %r6, %r8;
or.b32 %r317, %r315, %r277;
setp.ne.s32 %p92, %r317, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
mov.u32 %r318, %ctaid.x;
mov.u32 %r319, %ctaid.z;
mov.u32 %r320, %nctaid.x;
mad.lo.s32 %r321, %r319, %r320, %r318;
mul.wide.s32 %rd120, %r321, 8;
add.s64 %rd28, %rd119, %rd120;
add.s32 %r322, %r9, -1;
setp.eq.s32 %p93, %r74, %r322;
cvt.s64.s32 %rd121, %r9;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r438, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r438;
// end inline asm
setp.lt.u32 %p95, %r438, 256;
selp.u32 %r325, 1, 0, %p95;
shl.b32 %r438, %r438, %r325;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r326, %r9, %r3;
add.s32 %r327, %r326, -1;
div.s32 %r77, %r327, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
add.s32 %r329, %r161, 1;
shr.u32 %r330, %r329, 31;
add.s32 %r331, %r329, %r330;
shr.s32 %r332, %r331, 1;
add.s32 %r333, %r4, %r332;
add.s32 %r334, %r333, -1;
shl.b32 %r335, %r8, 1;
shl.b32 %r336, %r4, 1;
mad.lo.s32 %r337, %r336, %r74, %r335;
or.b32 %r338, %r337, 1;
setp.ge.s32 %p98, %r338, %r161;
div.s32 %r339, %r334, %r4;
setp.ge.s32 %p99, %r74, %r339;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r340, %r4, %r74;
shl.b32 %r341, %r340, 1;
mad.lo.s32 %r342, %r161, %r6, %r341;
add.s32 %r440, %r342, %r335;
mul.lo.s32 %r79, %r161, %r3;
mov.u32 %r328, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r439, %r6;
mov.u32 %r441, %r328;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r442, %r328;
mov.u32 %r443, %r328;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r439, %r9;
mov.u32 %r442, %r328;
mov.u32 %r443, %r328;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd130, %r440, 4;
add.s64 %rd129, %rd41, %rd130;
// begin inline asm
ld.volatile.global.v2.s32 {%r443,%r442}, [%rd129];
// end inline asm
$L__BB0_148:
mov.b32 %f304, %r443;
add.f32 %f380, %f380, %f304;
mov.b32 %f305, %r442;
add.f32 %f381, %f381, %f305;
add.s32 %r440, %r440, %r79;
add.s32 %r439, %r439, %r3;
add.s32 %r441, %r441, 1;
setp.lt.s32 %p101, %r441, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r349, %r3;
mov.u32 %r350, 31;
sub.s32 %r351, %r350, %r349;
mov.u32 %r352, 1;
shl.b32 %r90, %r352, %r351;
setp.lt.u32 %p102, %r6, %r90;
add.s32 %r353, %r90, %r6;
setp.lt.u32 %p103, %r353, %r3;
and.pred %p7, %p102, %p103;
add.s32 %r354, %r39, %r90;
mul.wide.s32 %rd131, %r354, 4;
add.s64 %rd30, %rd43, %rd131;
shr.u32 %r355, %r90, 31;
add.s32 %r356, %r90, %r355;
shr.s32 %r455, %r356, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r444, %r455;
$L__BB0_153:
setp.ge.u32 %p106, %r6, %r444;
@%p106 bra $L__BB0_155;
add.s32 %r357, %r444, %r39;
mul.wide.s32 %rd133, %r357, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r444, 1;
setp.gt.u32 %p107, %r444, 3;
mov.u32 %r444, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r359, %r39, 1;
mul.wide.u32 %rd136, %r359, 4;
add.s64 %rd31, %rd43, %rd136;
setp.ne.s32 %p108, %r6, 0;
mov.u32 %r445, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
mov.b32 %r445, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r446, %r455;
$L__BB0_164:
setp.ge.u32 %p112, %r6, %r446;
@%p112 bra $L__BB0_166;
add.s32 %r360, %r446, %r39;
mul.wide.s32 %rd138, %r360, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r446, 1;
setp.gt.u32 %p113, %r446, 3;
mov.u32 %r446, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r447, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
mov.b32 %r447, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r362, %r161, 1;
shr.u32 %r363, %r362, 31;
add.s32 %r364, %r362, %r363;
shr.s32 %r365, %r364, 1;
add.s32 %r366, %r4, %r365;
add.s32 %r367, %r366, -1;
div.s32 %r368, %r367, %r4;
setp.ge.s32 %p117, %r74, %r368;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r8, 1;
mul.lo.s32 %r369, %r4, %r74;
shl.b32 %r101, %r369, 1;
add.s32 %r370, %r100, %r101;
or.b32 %r371, %r370, 1;
setp.ge.s32 %p118, %r371, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r374, %r101, %r100;
mul.wide.s32 %rd142, %r374, 4;
add.s64 %rd141, %rd154, %rd142;
// begin inline asm
st.global.cs.v2.s32 [%rd141], {%r445,%r447};
// end inline asm
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
add.s32 %r376, %r161, 1;
shr.u32 %r377, %r376, 31;
add.s32 %r378, %r376, %r377;
shr.s32 %r379, %r378, 1;
add.s32 %r380, %r4, %r379;
add.s32 %r381, %r380, -1;
shl.b32 %r382, %r8, 1;
shl.b32 %r383, %r4, 1;
mad.lo.s32 %r384, %r383, %r74, %r382;
or.b32 %r385, %r384, 1;
setp.ge.s32 %p120, %r385, %r161;
div.s32 %r386, %r381, %r4;
setp.ge.s32 %p121, %r74, %r386;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r387, %r4, %r74;
shl.b32 %r388, %r387, 1;
mad.lo.s32 %r389, %r161, %r6, %r388;
add.s32 %r449, %r389, %r382;
mul.lo.s32 %r103, %r161, %r3;
mov.u32 %r375, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r448, %r6;
mov.u32 %r450, %r375;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r451, %r375;
mov.u32 %r452, %r375;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r448, %r9;
mov.u32 %r451, %r375;
mov.u32 %r452, %r375;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd144, %r449, 4;
add.s64 %rd143, %rd40, %rd144;
// begin inline asm
ld.volatile.global.v2.s32 {%r452,%r451}, [%rd143];
// end inline asm
$L__BB0_180:
mov.b32 %f326, %r452;
add.f32 %f386, %f386, %f326;
mov.b32 %f327, %r451;
add.f32 %f387, %f387, %f327;
add.s32 %r449, %r449, %r103;
add.s32 %r448, %r448, %r3;
add.s32 %r450, %r450, 1;
setp.lt.s32 %p123, %r450, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r453, %r455;
$L__BB0_185:
setp.ge.u32 %p126, %r6, %r453;
@%p126 bra $L__BB0_187;
add.s32 %r396, %r453, %r39;
mul.wide.s32 %rd145, %r396, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r453, 1;
setp.gt.u32 %p127, %r453, 3;
mov.u32 %r453, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r454, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
mov.b32 %r454, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r6, %r455;
@%p132 bra $L__BB0_197;
add.s32 %r398, %r455, %r39;
mul.wide.s32 %rd148, %r398, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r455, 1;
setp.gt.u32 %p133, %r455, 3;
mov.u32 %r455, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r456, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
mov.b32 %r456, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r400, %r161, 1;
shr.u32 %r401, %r400, 31;
add.s32 %r402, %r400, %r401;
shr.s32 %r403, %r402, 1;
add.s32 %r404, %r4, %r403;
add.s32 %r405, %r404, -1;
div.s32 %r406, %r405, %r4;
setp.ge.s32 %p137, %r74, %r406;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r8, 1;
mul.lo.s32 %r407, %r4, %r74;
shl.b32 %r123, %r407, 1;
add.s32 %r408, %r122, %r123;
or.b32 %r409, %r408, 1;
setp.ge.s32 %p138, %r409, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_16_cu_aab2bf46_723310nvfuser_16ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r412, %r123, %r122;
mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd151, %rd153, %rd152;
// begin inline asm
st.global.cs.v2.s32 [%rd151], {%r454,%r456};
// end inline asm
$L__BB0_206:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<390>;
- .reg .b32 %r<459>;
+ .reg .b32 %r<457>;
.reg .f64 %fd<3>;
.reg .b64 %rd<156>;
ld.param.v2.u32 {%r160, %r161}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
@@ -50,110 +50,110 @@
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r196, %r161, 3;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 30;
add.s32 %r199, %r196, %r198;
- shr.s32 %r200, %r199, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r201, %r200, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r202, %r3, 2;
- mad.lo.s32 %r203, %r202, %r201, 15;
- and.b32 %r204, %r203, -16;
- cvt.u64.u32 %rd1, %r204;
- mul.lo.s32 %r205, %r3, %r200;
- shl.b32 %r206, %r205, 4;
- or.b32 %r207, %r206, 15;
- and.b32 %r4, %r207, -16;
- add.s32 %r208, %r207, %r4;
- and.b32 %r209, %r208, -16;
- cvt.s64.s32 %rd2, %r209;
+ shr.s32 %r2, %r199, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r200, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r201, %r4, 2;
+ mad.lo.s32 %r202, %r201, %r200, 15;
+ and.b32 %r203, %r202, -16;
+ cvt.u64.u32 %rd1, %r203;
+ mul.lo.s32 %r204, %r4, %r2;
+ shl.b32 %r205, %r204, 4;
+ or.b32 %r206, %r205, 15;
+ and.b32 %r5, %r206, -16;
+ add.s32 %r207, %r206, %r5;
+ and.b32 %r208, %r207, -16;
+ cvt.s64.s32 %rd2, %r208;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p9, %r5, %r200;
- shl.b32 %r6, %r5, 2;
- or.b32 %r210, %r6, 3;
- setp.lt.s32 %p10, %r210, %r161;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p9, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r209, %r7, 3;
+ setp.lt.s32 %p10, %r209, %r161;
and.pred %p1, %p10, %p9;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p11, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd45, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r211, smem_ptr; }
-
-
- shl.b32 %r214, %r5, 4;
- add.s32 %r212, %r211, %r214;
- mul.wide.s32 %rd47, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r210, smem_ptr; }
+
+
+ shl.b32 %r213, %r6, 4;
+ add.s32 %r211, %r210, %r213;
+ mul.wide.s32 %rd47, %r7, 4;
add.s64 %rd46, %rd36, %rd47;
- mov.u32 %r213, 0;
+ mov.u32 %r212, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r213, 0;
- cp.async.ca.shared.global [%r212], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r212, 0;
+ cp.async.ca.shared.global [%r211], [%rd46], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r215, %r3, 215;
- div.s32 %r216, %r215, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r217, %r8, %r216;
- add.s32 %r218, %r217, -1;
- div.s32 %r9, %r218, %r8;
- setp.gt.s32 %p13, %r9, 0;
+ add.s32 %r214, %r4, 215;
+ div.s32 %r215, %r214, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r216, %r9, %r215;
+ add.s32 %r217, %r216, -1;
+ div.s32 %r10, %r217, %r9;
+ setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r161;
- cvt.s64.s32 %rd48, %r4;
+ cvt.s64.s32 %rd48, %r5;
add.s64 %rd49, %rd1, %rd48;
add.s64 %rd51, %rd43, %rd1;
- mov.u32 %r220, %ctaid.y;
- mul.lo.s32 %r221, %r9, %r3;
- mul.lo.s32 %r10, %r221, %r220;
- shl.b32 %r222, %r7, 2;
- shl.b32 %r223, %r5, 4;
- mad.lo.s32 %r11, %r222, %r161, %r223;
- mul.lo.s32 %r224, %r161, %r7;
- cvt.s64.s32 %rd52, %r224;
- cvt.s64.s32 %rd53, %r6;
+ mov.u32 %r219, %ctaid.y;
+ mul.lo.s32 %r220, %r10, %r4;
+ mul.lo.s32 %r11, %r220, %r219;
+ mad.lo.s32 %r221, %r2, %r8, %r6;
+ shl.b32 %r12, %r221, 4;
+ mul.lo.s32 %r222, %r161, %r8;
+ cvt.s64.s32 %rd52, %r222;
+ cvt.s64.s32 %rd53, %r7;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r225, %r10, %r161;
- cvt.s64.s32 %rd6, %r225;
- mul.lo.s32 %r12, %r161, %r3;
- mul.lo.s32 %r13, %r9, %r220;
- add.s32 %r14, %r224, %r6;
+ mul.lo.s32 %r223, %r11, %r161;
+ cvt.s64.s32 %rd6, %r223;
+ mul.lo.s32 %r13, %r161, %r4;
+ mul.lo.s32 %r14, %r10, %r219;
+ shl.b32 %r224, %r8, 2;
+ mad.lo.s32 %r225, %r224, %r2, %r7;
add.s64 %rd54, %rd43, %rd49;
- mul.wide.s32 %rd55, %r14, 4;
+ mul.wide.s32 %rd55, %r225, 4;
add.s64 %rd7, %rd54, %rd55;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r226, %tid.z;
- mad.lo.s32 %r227, %r3, %r226, %r7;
- mad.lo.s32 %r15, %r227, %r2, %r5;
+ mad.lo.s32 %r227, %r4, %r226, %r8;
+ mad.lo.s32 %r15, %r227, %r3, %r6;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd8, %rd43, %rd56;
- clz.b32 %r228, %r2;
+ clz.b32 %r228, %r3;
mov.u32 %r229, 31;
sub.s32 %r230, %r229, %r228;
mov.u32 %r231, 1;
shl.b32 %r16, %r231, %r230;
- setp.lt.u32 %p14, %r5, %r16;
- add.s32 %r232, %r16, %r5;
- setp.lt.u32 %p15, %r232, %r2;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r232, %r16, %r6;
+ setp.lt.u32 %p15, %r232, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r233, %r15, %r16;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd9, %rd43, %rd57;
shr.u32 %r234, %r16, 31;
@@ -162,31 +162,31 @@
add.s64 %rd10, %rd51, %rd55;
add.s32 %r236, %r15, 1;
mul.wide.u32 %rd58, %r236, 4;
add.s64 %rd11, %rd43, %rd58;
add.s64 %rd59, %rd43, %rd4;
- mul.wide.s32 %rd60, %r6, 4;
+ mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd12, %rd59, %rd60;
mul.wide.s32 %rd61, %r227, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd35;
add.s64 %rd19, %rd44, %rd49;
- mov.u32 %r417, 0;
+ mov.u32 %r415, 0;
mov.f32 %f354, 0f00000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r239, smem_ptr; }
- add.s32 %r240, %r11, %r239;
+ add.s32 %r240, %r239, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r265, smem_ptr; }
- add.s32 %r266, %r11, %r265;
+ add.s32 %r266, %r265, %r12;
not.pred %p26, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
@@ -196,16 +196,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r237, %r417, %r3, %r7;
- add.s32 %r238, %r237, %r10;
+ mad.lo.s32 %r237, %r415, %r4, %r8;
+ add.s32 %r238, %r237, %r11;
setp.gt.s32 %p17, %r238, 215;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r242, %r12, %r417;
+ mul.lo.s32 %r242, %r13, %r415;
cvt.s64.s32 %rd65, %r242;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd32, %rd68;
@@ -224,53 +224,53 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r243, %r13, %r417;
- mad.lo.s32 %r244, %r243, %r3, %r7;
+ add.s32 %r243, %r14, %r415;
+ mad.lo.s32 %r244, %r243, %r4, %r8;
setp.lt.s32 %p19, %r244, 216;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
+ ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r253, %r13, %r417;
- mad.lo.s32 %r254, %r253, %r3, %r7;
+ add.s32 %r253, %r14, %r415;
+ mad.lo.s32 %r254, %r253, %r4, %r8;
setp.gt.s32 %p20, %r254, 215;
- mov.u32 %r418, 0;
- mov.u32 %r419, %r418;
- mov.u32 %r420, %r418;
- mov.u32 %r421, %r418;
+ mov.u32 %r416, 0;
+ mov.u32 %r417, %r416;
+ mov.u32 %r418, %r416;
+ mov.u32 %r419, %r416;
@%p20 bra $L__BB0_15;
- ld.shared.v4.u32 {%r418, %r419, %r420, %r421}, [%rd7];
+ ld.shared.v4.u32 {%r416, %r417, %r418, %r419}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r418, 0;
- mov.u32 %r419, %r418;
- mov.u32 %r420, %r418;
- mov.u32 %r421, %r418;
+ mov.u32 %r416, 0;
+ mov.u32 %r417, %r416;
+ mov.u32 %r418, %r416;
+ mov.u32 %r419, %r416;
$L__BB0_15:
- add.s32 %r263, %r13, %r417;
- mad.lo.s32 %r33, %r263, %r3, %r7;
- mov.b32 %f112, %r421;
+ add.s32 %r263, %r14, %r415;
+ mad.lo.s32 %r33, %r263, %r4, %r8;
+ mov.b32 %f112, %r419;
add.f32 %f369, %f369, %f112;
- mov.b32 %f113, %r420;
+ mov.b32 %f113, %r418;
add.f32 %f368, %f368, %f113;
- mov.b32 %f114, %r419;
+ mov.b32 %f114, %r417;
add.f32 %f367, %f367, %f114;
- mov.b32 %f115, %r418;
+ mov.b32 %f115, %r416;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p21, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p21 bra $L__BB0_17;
@@ -283,11 +283,11 @@
setp.lt.s32 %p22, %r33, 216;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
- mul.lo.s32 %r268, %r12, %r417;
+ mul.lo.s32 %r268, %r13, %r415;
cvt.s64.s32 %rd73, %r268;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd33, %rd76;
@@ -300,13 +300,13 @@
}
$L__BB0_19:
- add.s32 %r416, %r13, %r417;
- mad.lo.s32 %r415, %r416, %r3, %r7;
- setp.gt.s32 %p142, %r415, 215;
+ add.s32 %r414, %r14, %r415;
+ mad.lo.s32 %r413, %r414, %r4, %r8;
+ setp.gt.s32 %p142, %r413, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r269, %r33, %r174;
@@ -363,37 +363,37 @@
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
- mov.u32 %r422, %r17;
+ mov.u32 %r420, %r17;
$L__BB0_27:
- setp.ge.u32 %p28, %r5, %r422;
+ setp.ge.u32 %p28, %r6, %r420;
@%p28 bra $L__BB0_29;
- add.s32 %r270, %r422, %r15;
+ add.s32 %r270, %r420, %r15;
mul.wide.s32 %rd79, %r270, 4;
add.s64 %rd81, %rd43, %rd79;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd81];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r422, 1;
- setp.gt.u32 %p29, %r422, 3;
- mov.u32 %r422, %r35;
+ shr.u32 %r35, %r420, 1;
+ setp.gt.u32 %p29, %r420, 3;
+ mov.u32 %r420, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p30, %r5, 0;
+ setp.ne.s32 %p30, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p30 bra $L__BB0_33;
- setp.lt.u32 %p31, %r2, 2;
+ setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
@@ -413,36 +413,36 @@
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
- mov.u32 %r423, %r17;
+ mov.u32 %r421, %r17;
$L__BB0_37:
- setp.ge.u32 %p34, %r5, %r423;
+ setp.ge.u32 %p34, %r6, %r421;
@%p34 bra $L__BB0_39;
- add.s32 %r271, %r423, %r15;
+ add.s32 %r271, %r421, %r15;
mul.wide.s32 %rd82, %r271, 4;
add.s64 %rd84, %rd43, %rd82;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd84];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r423, 1;
- setp.gt.u32 %p35, %r423, 3;
- mov.u32 %r423, %r37;
+ shr.u32 %r37, %r421, 1;
+ setp.gt.u32 %p35, %r421, 3;
+ mov.u32 %r421, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p30 bra $L__BB0_43;
- setp.lt.u32 %p37, %r2, 2;
+ setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
@@ -506,21 +506,20 @@
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r275, %f235;
- mad.lo.s32 %r276, %r417, %r3, %r10;
- mad.lo.s32 %r277, %r276, %r161, %r14;
- mul.wide.s32 %rd86, %r277, 4;
+ mad.lo.s32 %r276, %r33, %r161, %r7;
+ mul.wide.s32 %rd86, %r276, 4;
add.s64 %rd85, %rd37, %rd86;
st.global.cs.v4.s32 [%rd85], {%r272,%r273,%r274,%r275};
$L__BB0_49:
- add.s32 %r417, %r417, 1;
- setp.lt.s32 %p41, %r417, %r9;
+ add.s32 %r415, %r415, 1;
+ setp.lt.s32 %p41, %r415, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
@@ -531,31 +530,31 @@
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
- mov.u32 %r278, %tid.z;
- mad.lo.s32 %r279, %r3, %r278, %r7;
- mad.lo.s32 %r39, %r279, %r2, %r5;
+ mov.u32 %r277, %tid.z;
+ mad.lo.s32 %r278, %r4, %r277, %r8;
+ mad.lo.s32 %r39, %r278, %r3, %r6;
mul.wide.u32 %rd87, %r39, 4;
add.s64 %rd23, %rd43, %rd87;
- clz.b32 %r280, %r3;
- mov.u32 %r281, 31;
- sub.s32 %r282, %r281, %r280;
- mov.u32 %r283, 1;
- shl.b32 %r40, %r283, %r282;
- setp.lt.u32 %p42, %r7, %r40;
- add.s32 %r284, %r40, %r7;
- setp.lt.u32 %p43, %r284, %r3;
+ clz.b32 %r279, %r4;
+ mov.u32 %r280, 31;
+ sub.s32 %r281, %r280, %r279;
+ mov.u32 %r282, 1;
+ shl.b32 %r40, %r282, %r281;
+ setp.lt.u32 %p42, %r8, %r40;
+ add.s32 %r283, %r40, %r8;
+ setp.lt.u32 %p43, %r283, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r285, %r2, %r282;
- add.s32 %r286, %r39, %r285;
- mul.wide.s32 %rd89, %r286, 4;
+ shl.b32 %r284, %r3, %r281;
+ add.s32 %r285, %r39, %r284;
+ mul.wide.s32 %rd89, %r285, 4;
add.s64 %rd24, %rd43, %rd89;
- shr.u32 %r287, %r40, 31;
- add.s32 %r288, %r40, %r287;
- shr.s32 %r438, %r288, 1;
+ shr.u32 %r286, %r40, 31;
+ add.s32 %r287, %r40, %r286;
+ shr.s32 %r436, %r287, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
@@ -567,49 +566,49 @@
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
- mov.u32 %r424, %r438;
+ mov.u32 %r422, %r436;
$L__BB0_54:
- setp.ge.u32 %p46, %r7, %r424;
+ setp.ge.u32 %p46, %r8, %r422;
@%p46 bra $L__BB0_56;
- mad.lo.s32 %r289, %r424, %r2, %r39;
- mul.wide.s32 %rd90, %r289, 4;
+ mad.lo.s32 %r288, %r422, %r3, %r39;
+ mul.wide.s32 %rd90, %r288, 4;
add.s64 %rd92, %rd43, %rd90;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd92];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r424, 1;
- setp.gt.u32 %p47, %r424, 3;
- mov.u32 %r424, %r43;
+ shr.u32 %r43, %r422, 1;
+ setp.gt.u32 %p47, %r422, 3;
+ mov.u32 %r422, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r291, %r39, %r2;
- mul.wide.u32 %rd93, %r291, 4;
+ add.s32 %r290, %r39, %r3;
+ mul.wide.u32 %rd93, %r290, 4;
add.s64 %rd25, %rd43, %rd93;
- setp.ne.s32 %p48, %r7, 0;
- mov.u32 %r425, 0;
+ setp.ne.s32 %p48, %r8, 0;
+ mov.u32 %r423, 0;
@%p48 bra $L__BB0_61;
- setp.lt.u32 %p49, %r3, 2;
+ setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
- mov.b32 %r425, %f370;
+ mov.b32 %r423, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@@ -622,45 +621,45 @@
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
- mov.u32 %r426, %r438;
+ mov.u32 %r424, %r436;
$L__BB0_65:
- setp.ge.u32 %p52, %r7, %r426;
+ setp.ge.u32 %p52, %r8, %r424;
@%p52 bra $L__BB0_67;
- mad.lo.s32 %r292, %r426, %r2, %r39;
- mul.wide.s32 %rd95, %r292, 4;
+ mad.lo.s32 %r291, %r424, %r3, %r39;
+ mul.wide.s32 %rd95, %r291, 4;
add.s64 %rd97, %rd43, %rd95;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd97];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r426, 1;
- setp.gt.u32 %p53, %r426, 3;
- mov.u32 %r426, %r47;
+ shr.u32 %r47, %r424, 1;
+ setp.gt.u32 %p53, %r424, 3;
+ mov.u32 %r424, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r427, 0;
+ mov.u32 %r425, 0;
@%p48 bra $L__BB0_72;
- setp.lt.u32 %p55, %r3, 2;
+ setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
- mov.b32 %r427, %f371;
+ mov.b32 %r425, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@@ -673,45 +672,45 @@
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
- mov.u32 %r428, %r438;
+ mov.u32 %r426, %r436;
$L__BB0_76:
- setp.ge.u32 %p58, %r7, %r428;
+ setp.ge.u32 %p58, %r8, %r426;
@%p58 bra $L__BB0_78;
- mad.lo.s32 %r294, %r428, %r2, %r39;
- mul.wide.s32 %rd98, %r294, 4;
+ mad.lo.s32 %r293, %r426, %r3, %r39;
+ mul.wide.s32 %rd98, %r293, 4;
add.s64 %rd100, %rd43, %rd98;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd100];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r428, 1;
- setp.gt.u32 %p59, %r428, 3;
- mov.u32 %r428, %r51;
+ shr.u32 %r51, %r426, 1;
+ setp.gt.u32 %p59, %r426, 3;
+ mov.u32 %r426, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r429, 0;
+ mov.u32 %r427, 0;
@%p48 bra $L__BB0_83;
- setp.lt.u32 %p61, %r3, 2;
+ setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
- mov.b32 %r429, %f372;
+ mov.b32 %r427, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@@ -724,45 +723,45 @@
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
- mov.u32 %r430, %r438;
+ mov.u32 %r428, %r436;
$L__BB0_87:
- setp.ge.u32 %p64, %r7, %r430;
+ setp.ge.u32 %p64, %r8, %r428;
@%p64 bra $L__BB0_89;
- mad.lo.s32 %r296, %r430, %r2, %r39;
- mul.wide.s32 %rd101, %r296, 4;
+ mad.lo.s32 %r295, %r428, %r3, %r39;
+ mul.wide.s32 %rd101, %r295, 4;
add.s64 %rd103, %rd43, %rd101;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd103];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r430, 1;
- setp.gt.u32 %p65, %r430, 3;
- mov.u32 %r430, %r55;
+ shr.u32 %r55, %r428, 1;
+ setp.gt.u32 %p65, %r428, 3;
+ mov.u32 %r428, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r431, 0;
+ mov.u32 %r429, 0;
@%p48 bra $L__BB0_94;
- setp.lt.u32 %p67, %r3, 2;
+ setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
- mov.b32 %r431, %f373;
+ mov.b32 %r429, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@@ -775,45 +774,45 @@
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
- mov.u32 %r432, %r438;
+ mov.u32 %r430, %r436;
$L__BB0_98:
- setp.ge.u32 %p70, %r7, %r432;
+ setp.ge.u32 %p70, %r8, %r430;
@%p70 bra $L__BB0_100;
- mad.lo.s32 %r298, %r432, %r2, %r39;
- mul.wide.s32 %rd104, %r298, 4;
+ mad.lo.s32 %r297, %r430, %r3, %r39;
+ mul.wide.s32 %rd104, %r297, 4;
add.s64 %rd106, %rd43, %rd104;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd106];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r432, 1;
- setp.gt.u32 %p71, %r432, 3;
- mov.u32 %r432, %r59;
+ shr.u32 %r59, %r430, 1;
+ setp.gt.u32 %p71, %r430, 3;
+ mov.u32 %r430, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r433, 0;
+ mov.u32 %r431, 0;
@%p48 bra $L__BB0_105;
- setp.lt.u32 %p73, %r3, 2;
+ setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
- mov.b32 %r433, %f374;
+ mov.b32 %r431, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@@ -826,45 +825,45 @@
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
- mov.u32 %r434, %r438;
+ mov.u32 %r432, %r436;
$L__BB0_109:
- setp.ge.u32 %p76, %r7, %r434;
+ setp.ge.u32 %p76, %r8, %r432;
@%p76 bra $L__BB0_111;
- mad.lo.s32 %r300, %r434, %r2, %r39;
- mul.wide.s32 %rd107, %r300, 4;
+ mad.lo.s32 %r299, %r432, %r3, %r39;
+ mul.wide.s32 %rd107, %r299, 4;
add.s64 %rd109, %rd43, %rd107;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd109];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r434, 1;
- setp.gt.u32 %p77, %r434, 3;
- mov.u32 %r434, %r63;
+ shr.u32 %r63, %r432, 1;
+ setp.gt.u32 %p77, %r432, 3;
+ mov.u32 %r432, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r435, 0;
+ mov.u32 %r433, 0;
@%p48 bra $L__BB0_116;
- setp.lt.u32 %p79, %r3, 2;
+ setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
- mov.b32 %r435, %f375;
+ mov.b32 %r433, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@@ -877,45 +876,45 @@
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
- mov.u32 %r436, %r438;
+ mov.u32 %r434, %r436;
$L__BB0_120:
- setp.ge.u32 %p82, %r7, %r436;
+ setp.ge.u32 %p82, %r8, %r434;
@%p82 bra $L__BB0_122;
- mad.lo.s32 %r302, %r436, %r2, %r39;
- mul.wide.s32 %rd110, %r302, 4;
+ mad.lo.s32 %r301, %r434, %r3, %r39;
+ mul.wide.s32 %rd110, %r301, 4;
add.s64 %rd112, %rd43, %rd110;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd112];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r436, 1;
- setp.gt.u32 %p83, %r436, 3;
- mov.u32 %r436, %r67;
+ shr.u32 %r67, %r434, 1;
+ setp.gt.u32 %p83, %r434, 3;
+ mov.u32 %r434, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r437, 0;
+ mov.u32 %r435, 0;
@%p48 bra $L__BB0_127;
- setp.lt.u32 %p85, %r3, 2;
+ setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
- mov.b32 %r437, %f376;
+ mov.b32 %r435, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@@ -929,185 +928,184 @@
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p88, %r7, %r438;
+ setp.ge.u32 %p88, %r8, %r436;
@%p88 bra $L__BB0_132;
- mad.lo.s32 %r304, %r438, %r2, %r39;
- mul.wide.s32 %rd113, %r304, 4;
+ mad.lo.s32 %r303, %r436, %r3, %r39;
+ mul.wide.s32 %rd113, %r303, 4;
add.s64 %rd115, %rd43, %rd113;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd115];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r438, 1;
- setp.gt.u32 %p89, %r438, 3;
- mov.u32 %r438, %r71;
+ shr.u32 %r71, %r436, 1;
+ setp.gt.u32 %p89, %r436, 3;
+ mov.u32 %r436, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r439, 0;
+ mov.u32 %r437, 0;
@%p48 bra $L__BB0_137;
- setp.lt.u32 %p91, %r3, 2;
+ setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
- mov.b32 %r439, %f377;
+ mov.b32 %r437, %f377;
$L__BB0_137:
- setp.eq.s32 %p141, %r7, 0;
+ setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r414, %r5, 2;
- mov.u32 %r314, %ctaid.y;
- mad.lo.s32 %r315, %r161, %r314, %r414;
- mul.wide.s32 %rd118, %r315, 4;
+ mov.u32 %r313, %ctaid.y;
+ mad.lo.s32 %r314, %r161, %r313, %r7;
+ mul.wide.s32 %rd118, %r314, 4;
add.s64 %rd116, %rd40, %rd118;
- st.volatile.global.v4.s32 [%rd116], {%r425,%r427,%r429,%r431};
+ st.volatile.global.v4.s32 [%rd116], {%r423,%r425,%r427,%r429};
add.s64 %rd117, %rd41, %rd118;
- st.volatile.global.v4.s32 [%rd117], {%r433,%r435,%r437,%r439};
+ st.volatile.global.v4.s32 [%rd117], {%r431,%r433,%r435,%r437};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r316, %r5, %r7;
- or.b32 %r318, %r316, %r278;
- setp.ne.s32 %p92, %r318, 0;
+ or.b32 %r315, %r6, %r8;
+ or.b32 %r317, %r315, %r277;
+ setp.ne.s32 %p92, %r317, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd155, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd119, %rd155;
- mov.u32 %r319, %ctaid.x;
- mov.u32 %r320, %ctaid.z;
- mov.u32 %r321, %nctaid.x;
- mad.lo.s32 %r322, %r320, %r321, %r319;
- mul.wide.s32 %rd120, %r322, 8;
+ mov.u32 %r318, %ctaid.x;
+ mov.u32 %r319, %ctaid.z;
+ mov.u32 %r320, %nctaid.x;
+ mad.lo.s32 %r321, %r319, %r320, %r318;
+ mul.wide.s32 %rd120, %r321, 8;
add.s64 %rd28, %rd119, %rd120;
- add.s32 %r323, %r8, -1;
- setp.eq.s32 %p93, %r74, %r323;
- cvt.s64.s32 %rd121, %r8;
+ add.s32 %r322, %r9, -1;
+ setp.eq.s32 %p93, %r74, %r322;
+ cvt.s64.s32 %rd121, %r9;
mov.u64 %rd122, -9223372036854775807;
sub.s64 %rd123, %rd122, %rd121;
selp.b64 %rd124, %rd123, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd124;
ld.volatile.global.u64 %rd125, [%rd28];
xor.b64 %rd126, %rd125, %rd29;
setp.lt.s64 %p94, %rd126, 0;
@%p94 bra $L__BB0_143;
- mov.u32 %r440, 8;
+ mov.u32 %r438, 8;
$L__BB0_142:
- nanosleep.u32 %r440;
-
- setp.lt.u32 %p95, %r440, 256;
- selp.u32 %r326, 1, 0, %p95;
- shl.b32 %r440, %r440, %r326;
+ nanosleep.u32 %r438;
+
+ setp.lt.u32 %p95, %r438, 256;
+ selp.u32 %r325, 1, 0, %p95;
+ shl.b32 %r438, %r438, %r325;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.gt.s64 %p96, %rd128, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- add.s32 %r327, %r8, %r2;
- add.s32 %r328, %r327, -1;
- div.s32 %r77, %r328, %r2;
+ add.s32 %r326, %r9, %r3;
+ add.s32 %r327, %r326, -1;
+ div.s32 %r77, %r327, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p97 bra $L__BB0_149;
- add.s32 %r330, %r161, 1;
- shr.u32 %r331, %r330, 31;
- add.s32 %r332, %r330, %r331;
- shr.s32 %r333, %r332, 1;
- add.s32 %r334, %r3, %r333;
- add.s32 %r335, %r334, -1;
- shl.b32 %r336, %r7, 1;
- shl.b32 %r337, %r3, 1;
- mad.lo.s32 %r338, %r337, %r74, %r336;
- or.b32 %r339, %r338, 1;
- setp.ge.s32 %p98, %r339, %r161;
- div.s32 %r340, %r335, %r3;
- setp.ge.s32 %p99, %r74, %r340;
+ add.s32 %r329, %r161, 1;
+ shr.u32 %r330, %r329, 31;
+ add.s32 %r331, %r329, %r330;
+ shr.s32 %r332, %r331, 1;
+ add.s32 %r333, %r4, %r332;
+ add.s32 %r334, %r333, -1;
+ shl.b32 %r335, %r8, 1;
+ shl.b32 %r336, %r4, 1;
+ mad.lo.s32 %r337, %r336, %r74, %r335;
+ or.b32 %r338, %r337, 1;
+ setp.ge.s32 %p98, %r338, %r161;
+ div.s32 %r339, %r334, %r4;
+ setp.ge.s32 %p99, %r74, %r339;
or.pred %p6, %p99, %p98;
- mul.lo.s32 %r341, %r3, %r74;
- shl.b32 %r342, %r341, 1;
- mad.lo.s32 %r343, %r161, %r5, %r342;
- add.s32 %r442, %r343, %r336;
- mul.lo.s32 %r79, %r161, %r2;
- mov.u32 %r329, 0;
+ mul.lo.s32 %r340, %r4, %r74;
+ shl.b32 %r341, %r340, 1;
+ mad.lo.s32 %r342, %r161, %r6, %r341;
+ add.s32 %r440, %r342, %r335;
+ mul.lo.s32 %r79, %r161, %r3;
+ mov.u32 %r328, 0;
mov.f32 %f380, 0f00000000;
- mov.u32 %r441, %r5;
- mov.u32 %r443, %r329;
+ mov.u32 %r439, %r6;
+ mov.u32 %r441, %r328;
$L__BB0_145:
.pragma "nounroll";
- mov.u32 %r444, %r329;
- mov.u32 %r445, %r329;
+ mov.u32 %r442, %r328;
+ mov.u32 %r443, %r328;
@%p6 bra $L__BB0_148;
- setp.ge.s32 %p100, %r441, %r8;
- mov.u32 %r444, %r329;
- mov.u32 %r445, %r329;
+ setp.ge.s32 %p100, %r439, %r9;
+ mov.u32 %r442, %r328;
+ mov.u32 %r443, %r328;
@%p100 bra $L__BB0_148;
- mul.wide.s32 %rd130, %r442, 4;
+ mul.wide.s32 %rd130, %r440, 4;
add.s64 %rd129, %rd41, %rd130;
- ld.volatile.global.v2.s32 {%r445,%r444}, [%rd129];
+ ld.volatile.global.v2.s32 {%r443,%r442}, [%rd129];
$L__BB0_148:
- mov.b32 %f304, %r445;
+ mov.b32 %f304, %r443;
add.f32 %f380, %f380, %f304;
- mov.b32 %f305, %r444;
+ mov.b32 %f305, %r442;
add.f32 %f381, %f381, %f305;
- add.s32 %r442, %r442, %r79;
- add.s32 %r441, %r441, %r2;
- add.s32 %r443, %r443, 1;
- setp.lt.s32 %p101, %r443, %r77;
+ add.s32 %r440, %r440, %r79;
+ add.s32 %r439, %r439, %r3;
+ add.s32 %r441, %r441, 1;
+ setp.lt.s32 %p101, %r441, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
- clz.b32 %r350, %r2;
- mov.u32 %r351, 31;
- sub.s32 %r352, %r351, %r350;
- mov.u32 %r353, 1;
- shl.b32 %r90, %r353, %r352;
- setp.lt.u32 %p102, %r5, %r90;
- add.s32 %r354, %r90, %r5;
- setp.lt.u32 %p103, %r354, %r2;
+ clz.b32 %r349, %r3;
+ mov.u32 %r350, 31;
+ sub.s32 %r351, %r350, %r349;
+ mov.u32 %r352, 1;
+ shl.b32 %r90, %r352, %r351;
+ setp.lt.u32 %p102, %r6, %r90;
+ add.s32 %r353, %r90, %r6;
+ setp.lt.u32 %p103, %r353, %r3;
and.pred %p7, %p102, %p103;
- add.s32 %r355, %r39, %r90;
- mul.wide.s32 %rd131, %r355, 4;
+ add.s32 %r354, %r39, %r90;
+ mul.wide.s32 %rd131, %r354, 4;
add.s64 %rd30, %rd43, %rd131;
- shr.u32 %r356, %r90, 31;
- add.s32 %r357, %r90, %r356;
- shr.s32 %r457, %r357, 1;
+ shr.u32 %r355, %r90, 31;
+ add.s32 %r356, %r90, %r355;
+ shr.s32 %r455, %r356, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
@@ -1119,49 +1117,49 @@
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
- mov.u32 %r446, %r457;
+ mov.u32 %r444, %r455;
$L__BB0_153:
- setp.ge.u32 %p106, %r5, %r446;
+ setp.ge.u32 %p106, %r6, %r444;
@%p106 bra $L__BB0_155;
- add.s32 %r358, %r446, %r39;
- mul.wide.s32 %rd133, %r358, 4;
+ add.s32 %r357, %r444, %r39;
+ mul.wide.s32 %rd133, %r357, 4;
add.s64 %rd135, %rd43, %rd133;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd135];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_155:
bar.sync 0;
- shr.u32 %r93, %r446, 1;
- setp.gt.u32 %p107, %r446, 3;
- mov.u32 %r446, %r93;
+ shr.u32 %r93, %r444, 1;
+ setp.gt.u32 %p107, %r444, 3;
+ mov.u32 %r444, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
- add.s32 %r360, %r39, 1;
- mul.wide.u32 %rd136, %r360, 4;
+ add.s32 %r359, %r39, 1;
+ mul.wide.u32 %rd136, %r359, 4;
add.s64 %rd31, %rd43, %rd136;
- setp.ne.s32 %p108, %r5, 0;
- mov.u32 %r447, 0;
+ setp.ne.s32 %p108, %r6, 0;
+ mov.u32 %r445, 0;
@%p108 bra $L__BB0_160;
- setp.lt.u32 %p109, %r2, 2;
+ setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_159:
- mov.b32 %r447, %f382;
+ mov.b32 %r445, %f382;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@@ -1174,131 +1172,131 @@
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
- mov.u32 %r448, %r457;
+ mov.u32 %r446, %r455;
$L__BB0_164:
- setp.ge.u32 %p112, %r5, %r448;
+ setp.ge.u32 %p112, %r6, %r446;
@%p112 bra $L__BB0_166;
- add.s32 %r361, %r448, %r39;
- mul.wide.s32 %rd138, %r361, 4;
+ add.s32 %r360, %r446, %r39;
+ mul.wide.s32 %rd138, %r360, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd140];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_166:
bar.sync 0;
- shr.u32 %r97, %r448, 1;
- setp.gt.u32 %p113, %r448, 3;
- mov.u32 %r448, %r97;
+ shr.u32 %r97, %r446, 1;
+ setp.gt.u32 %p113, %r446, 3;
+ mov.u32 %r446, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
- mov.u32 %r449, 0;
+ mov.u32 %r447, 0;
@%p108 bra $L__BB0_171;
- setp.lt.u32 %p115, %r2, 2;
+ setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_170:
- mov.b32 %r449, %f383;
+ mov.b32 %r447, %f383;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
- add.s32 %r363, %r161, 1;
- shr.u32 %r364, %r363, 31;
- add.s32 %r365, %r363, %r364;
- shr.s32 %r366, %r365, 1;
- add.s32 %r367, %r3, %r366;
- add.s32 %r368, %r367, -1;
- div.s32 %r369, %r368, %r3;
- setp.ge.s32 %p117, %r74, %r369;
+ add.s32 %r362, %r161, 1;
+ shr.u32 %r363, %r362, 31;
+ add.s32 %r364, %r362, %r363;
+ shr.s32 %r365, %r364, 1;
+ add.s32 %r366, %r4, %r365;
+ add.s32 %r367, %r366, -1;
+ div.s32 %r368, %r367, %r4;
+ setp.ge.s32 %p117, %r74, %r368;
@%p117 bra $L__BB0_175;
- shl.b32 %r100, %r7, 1;
- mul.lo.s32 %r370, %r3, %r74;
- shl.b32 %r101, %r370, 1;
- add.s32 %r371, %r100, %r101;
- or.b32 %r372, %r371, 1;
- setp.ge.s32 %p118, %r372, %r161;
+ shl.b32 %r100, %r8, 1;
+ mul.lo.s32 %r369, %r4, %r74;
+ shl.b32 %r101, %r369, 1;
+ add.s32 %r370, %r100, %r101;
+ or.b32 %r371, %r370, 1;
+ setp.ge.s32 %p118, %r371, %r161;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd154, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r375, %r101, %r100;
- mul.wide.s32 %rd142, %r375, 4;
+ add.s32 %r374, %r101, %r100;
+ mul.wide.s32 %rd142, %r374, 4;
add.s64 %rd141, %rd154, %rd142;
- st.global.cs.v2.s32 [%rd141], {%r447,%r449};
+ st.global.cs.v2.s32 [%rd141], {%r445,%r447};
$L__BB0_175:
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p97 bra $L__BB0_181;
- add.s32 %r377, %r161, 1;
- shr.u32 %r378, %r377, 31;
- add.s32 %r379, %r377, %r378;
- shr.s32 %r380, %r379, 1;
- add.s32 %r381, %r3, %r380;
- add.s32 %r382, %r381, -1;
- shl.b32 %r383, %r7, 1;
- shl.b32 %r384, %r3, 1;
- mad.lo.s32 %r385, %r384, %r74, %r383;
- or.b32 %r386, %r385, 1;
- setp.ge.s32 %p120, %r386, %r161;
- div.s32 %r387, %r382, %r3;
- setp.ge.s32 %p121, %r74, %r387;
+ add.s32 %r376, %r161, 1;
+ shr.u32 %r377, %r376, 31;
+ add.s32 %r378, %r376, %r377;
+ shr.s32 %r379, %r378, 1;
+ add.s32 %r380, %r4, %r379;
+ add.s32 %r381, %r380, -1;
+ shl.b32 %r382, %r8, 1;
+ shl.b32 %r383, %r4, 1;
+ mad.lo.s32 %r384, %r383, %r74, %r382;
+ or.b32 %r385, %r384, 1;
+ setp.ge.s32 %p120, %r385, %r161;
+ div.s32 %r386, %r381, %r4;
+ setp.ge.s32 %p121, %r74, %r386;
or.pred %p8, %p121, %p120;
- mul.lo.s32 %r388, %r3, %r74;
- shl.b32 %r389, %r388, 1;
- mad.lo.s32 %r390, %r161, %r5, %r389;
- add.s32 %r451, %r390, %r383;
- mul.lo.s32 %r103, %r161, %r2;
- mov.u32 %r376, 0;
+ mul.lo.s32 %r387, %r4, %r74;
+ shl.b32 %r388, %r387, 1;
+ mad.lo.s32 %r389, %r161, %r6, %r388;
+ add.s32 %r449, %r389, %r382;
+ mul.lo.s32 %r103, %r161, %r3;
+ mov.u32 %r375, 0;
mov.f32 %f386, 0f00000000;
- mov.u32 %r450, %r5;
- mov.u32 %r452, %r376;
+ mov.u32 %r448, %r6;
+ mov.u32 %r450, %r375;
$L__BB0_177:
.pragma "nounroll";
- mov.u32 %r453, %r376;
- mov.u32 %r454, %r376;
+ mov.u32 %r451, %r375;
+ mov.u32 %r452, %r375;
@%p8 bra $L__BB0_180;
- setp.ge.s32 %p122, %r450, %r8;
- mov.u32 %r453, %r376;
- mov.u32 %r454, %r376;
+ setp.ge.s32 %p122, %r448, %r9;
+ mov.u32 %r451, %r375;
+ mov.u32 %r452, %r375;
@%p122 bra $L__BB0_180;
- mul.wide.s32 %rd144, %r451, 4;
+ mul.wide.s32 %rd144, %r449, 4;
add.s64 %rd143, %rd40, %rd144;
- ld.volatile.global.v2.s32 {%r454,%r453}, [%rd143];
+ ld.volatile.global.v2.s32 {%r452,%r451}, [%rd143];
$L__BB0_180:
- mov.b32 %f326, %r454;
+ mov.b32 %f326, %r452;
add.f32 %f386, %f386, %f326;
- mov.b32 %f327, %r453;
+ mov.b32 %f327, %r451;
add.f32 %f387, %f387, %f327;
- add.s32 %r451, %r451, %r103;
- add.s32 %r450, %r450, %r2;
- add.s32 %r452, %r452, 1;
- setp.lt.s32 %p123, %r452, %r77;
+ add.s32 %r449, %r449, %r103;
+ add.s32 %r448, %r448, %r3;
+ add.s32 %r450, %r450, 1;
+ setp.lt.s32 %p123, %r450, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@@ -1311,45 +1309,45 @@
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
- mov.u32 %r455, %r457;
+ mov.u32 %r453, %r455;
$L__BB0_185:
- setp.ge.u32 %p126, %r5, %r455;
+ setp.ge.u32 %p126, %r6, %r453;
@%p126 bra $L__BB0_187;
- add.s32 %r397, %r455, %r39;
- mul.wide.s32 %rd145, %r397, 4;
+ add.s32 %r396, %r453, %r39;
+ mul.wide.s32 %rd145, %r396, 4;
add.s64 %rd147, %rd43, %rd145;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd147];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_187:
bar.sync 0;
- shr.u32 %r115, %r455, 1;
- setp.gt.u32 %p127, %r455, 3;
- mov.u32 %r455, %r115;
+ shr.u32 %r115, %r453, 1;
+ setp.gt.u32 %p127, %r453, 3;
+ mov.u32 %r453, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
- mov.u32 %r456, 0;
+ mov.u32 %r454, 0;
@%p108 bra $L__BB0_192;
- setp.lt.u32 %p129, %r2, 2;
+ setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_191:
- mov.b32 %r456, %f388;
+ mov.b32 %r454, %f388;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@@ -1363,71 +1361,71 @@
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
- setp.ge.u32 %p132, %r5, %r457;
+ setp.ge.u32 %p132, %r6, %r455;
@%p132 bra $L__BB0_197;
- add.s32 %r399, %r457, %r39;
- mul.wide.s32 %rd148, %r399, 4;
+ add.s32 %r398, %r455, %r39;
+ mul.wide.s32 %rd148, %r398, 4;
add.s64 %rd150, %rd43, %rd148;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd150];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_197:
bar.sync 0;
- shr.u32 %r119, %r457, 1;
- setp.gt.u32 %p133, %r457, 3;
- mov.u32 %r457, %r119;
+ shr.u32 %r119, %r455, 1;
+ setp.gt.u32 %p133, %r455, 3;
+ mov.u32 %r455, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
- mov.u32 %r458, 0;
+ mov.u32 %r456, 0;
@%p108 bra $L__BB0_202;
- setp.lt.u32 %p135, %r2, 2;
+ setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_201:
- mov.b32 %r458, %f389;
+ mov.b32 %r456, %f389;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
- add.s32 %r401, %r161, 1;
- shr.u32 %r402, %r401, 31;
- add.s32 %r403, %r401, %r402;
- shr.s32 %r404, %r403, 1;
- add.s32 %r405, %r3, %r404;
- add.s32 %r406, %r405, -1;
- div.s32 %r407, %r406, %r3;
- setp.ge.s32 %p137, %r74, %r407;
+ add.s32 %r400, %r161, 1;
+ shr.u32 %r401, %r400, 31;
+ add.s32 %r402, %r400, %r401;
+ shr.s32 %r403, %r402, 1;
+ add.s32 %r404, %r4, %r403;
+ add.s32 %r405, %r404, -1;
+ div.s32 %r406, %r405, %r4;
+ setp.ge.s32 %p137, %r74, %r406;
@%p137 bra $L__BB0_206;
- shl.b32 %r122, %r7, 1;
- mul.lo.s32 %r408, %r3, %r74;
- shl.b32 %r123, %r408, 1;
- add.s32 %r409, %r122, %r123;
- or.b32 %r410, %r409, 1;
- setp.ge.s32 %p138, %r410, %r161;
+ shl.b32 %r122, %r8, 1;
+ mul.lo.s32 %r407, %r4, %r74;
+ shl.b32 %r123, %r407, 1;
+ add.s32 %r408, %r122, %r123;
+ or.b32 %r409, %r408, 1;
+ setp.ge.s32 %p138, %r409, %r161;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd153, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r413, %r123, %r122;
- mul.wide.s32 %rd152, %r413, 4;
+ add.s32 %r412, %r123, %r122;
+ mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd151, %rd153, %rd152;
- st.global.cs.v2.s32 [%rd151], {%r456,%r458};
+ st.global.cs.v2.s32 [%rd151], {%r454,%r456};
$L__BB0_206:
ret;
9: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_576
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
blockReduce<true, false, false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
blockReduce<true, false, false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
blockReduce<true, false, false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
blockReduce<true, false, false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<140>;
.reg .f32 %f<390>;
.reg .b32 %r<436>;
.reg .f64 %fd<3>;
.reg .b64 %rd<157>;
ld.param.v2.u32 {%r170, %r171}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r180, %r181}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r184, %r185}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r206, %r171, 3;
shr.s32 %r207, %r206, 31;
shr.u32 %r208, %r207, 30;
add.s32 %r209, %r206, %r208;
shr.s32 %r210, %r209, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r211, %r210, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r212, %r3, 2;
mad.lo.s32 %r213, %r212, %r211, 15;
and.b32 %r214, %r213, -16;
cvt.u64.u32 %rd1, %r214;
mul.lo.s32 %r215, %r3, %r210;
shl.b32 %r216, %r215, 4;
or.b32 %r217, %r216, 15;
and.b32 %r4, %r217, -16;
add.s32 %r218, %r217, %r4;
and.b32 %r219, %r218, -16;
cvt.s64.s32 %rd2, %r219;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p7, %r5, %r210;
shl.b32 %r6, %r5, 2;
or.b32 %r220, %r6, 3;
setp.lt.s32 %p8, %r220, %r171;
and.pred %p1, %p8, %p7;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p9, %r7, 0;
and.pred %p2, %p9, %p1;
not.pred %p10, %p2;
@%p10 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r221, smem_ptr; }
// end inline asm
shl.b32 %r224, %r5, 4;
add.s32 %r222, %r221, %r224;
mul.wide.s32 %rd49, %r6, 4;
add.s64 %rd48, %rd38, %rd49;
mov.u32 %r223, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r223, 0;
cp.async.ca.shared.global [%r222], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r225, %r3, 215;
div.s32 %r226, %r225, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r227, %r8, %r226;
add.s32 %r228, %r227, -1;
div.s32 %r9, %r228, %r8;
setp.gt.s32 %p11, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p11 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r171;
cvt.s64.s32 %rd50, %r4;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r230, %ctaid.y;
mul.lo.s32 %r231, %r9, %r3;
mul.lo.s32 %r10, %r231, %r230;
shl.b32 %r232, %r7, 2;
shl.b32 %r233, %r5, 4;
mad.lo.s32 %r11, %r232, %r171, %r233;
mul.lo.s32 %r234, %r171, %r7;
cvt.s64.s32 %rd54, %r234;
cvt.s64.s32 %rd55, %r6;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r235, %r10, %r171;
cvt.s64.s32 %rd6, %r235;
mul.lo.s32 %r12, %r171, %r3;
mul.lo.s32 %r13, %r9, %r230;
add.s32 %r14, %r234, %r6;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r14, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r236, %tid.z;
mad.lo.s32 %r237, %r3, %r236, %r7;
mad.lo.s32 %r15, %r237, %r2, %r5;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r238, %r2;
mov.u32 %r239, 31;
sub.s32 %r240, %r239, %r238;
mov.u32 %r241, 1;
shl.b32 %r16, %r241, %r240;
setp.lt.u32 %p12, %r5, %r16;
add.s32 %r242, %r16, %r5;
setp.lt.u32 %p13, %r242, %r2;
and.pred %p3, %p12, %p13;
add.s32 %r243, %r15, %r16;
mul.wide.s32 %rd59, %r243, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r244, %r16, 31;
add.s32 %r245, %r16, %r244;
shr.s32 %r17, %r245, 1;
add.s64 %rd10, %rd53, %rd57;
add.s32 %r246, %r15, 1;
mul.wide.u32 %rd60, %r246, 4;
add.s64 %rd11, %rd45, %rd60;
add.s64 %rd61, %rd45, %rd4;
mul.wide.s32 %rd62, %r6, 4;
add.s64 %rd12, %rd61, %rd62;
mul.wide.s32 %rd63, %r237, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd37;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r394, 0;
mov.f32 %f354, 0f00000000;
not.pred %p14, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r249, smem_ptr; }
// end inline asm
add.s32 %r250, %r11, %r249;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r275, smem_ptr; }
// end inline asm
add.s32 %r276, %r11, %r275;
not.pred %p24, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p14 bra $L__BB0_8;
mad.lo.s32 %r247, %r394, %r3, %r7;
add.s32 %r248, %r247, %r10;
setp.gt.s32 %p15, %r248, 215;
@%p15 bra $L__BB0_8;
mul.lo.s32 %r252, %r12, %r394;
cvt.s64.s32 %rd67, %r252;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd34, %rd70;
mov.u32 %r251, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r251, 0;
cp.async.ca.shared.global [%r250], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p14 bra $L__BB0_10;
add.s32 %r253, %r13, %r394;
mad.lo.s32 %r254, %r253, %r3, %r7;
setp.lt.s32 %p17, %r254, 216;
@%p17 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r395, %r396, %r397, %r398}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r263, %r13, %r394;
mad.lo.s32 %r264, %r263, %r3, %r7;
setp.gt.s32 %p18, %r264, 215;
mov.u32 %r395, 0;
mov.u32 %r396, %r395;
mov.u32 %r397, %r395;
mov.u32 %r398, %r395;
@%p18 bra $L__BB0_15;
ld.shared.v4.u32 {%r395, %r396, %r397, %r398}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r395, 0;
mov.u32 %r396, %r395;
mov.u32 %r397, %r395;
mov.u32 %r398, %r395;
$L__BB0_15:
add.s32 %r273, %r13, %r394;
mad.lo.s32 %r33, %r273, %r3, %r7;
mov.b32 %f112, %r398;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r397;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r396;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r395;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p19, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p19 bra $L__BB0_17;
mul.lo.s32 %r274, %r33, %r180;
mul.wide.s32 %rd71, %r274, 4;
add.s64 %rd72, %rd16, %rd71;
ld.global.f32 %f352, [%rd72];
$L__BB0_17:
setp.lt.s32 %p20, %r33, 216;
and.pred %p4, %p1, %p20;
not.pred %p21, %p4;
@%p21 bra $L__BB0_19;
mul.lo.s32 %r278, %r12, %r394;
cvt.s64.s32 %rd75, %r278;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd35, %rd78;
mov.u32 %r277, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r277, 0;
cp.async.ca.shared.global [%r276], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r393, %r13, %r394;
mad.lo.s32 %r392, %r393, %r3, %r7;
setp.gt.s32 %p139, %r392, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p139 bra $L__BB0_21;
mul.lo.s32 %r279, %r33, %r184;
mul.wide.s32 %rd79, %r279, 4;
add.s64 %rd80, %rd17, %rd79;
ld.global.f32 %f353, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p21 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p24 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_30;
mov.u32 %r399, %r17;
$L__BB0_27:
setp.ge.u32 %p26, %r5, %r399;
@%p26 bra $L__BB0_29;
add.s32 %r280, %r399, %r15;
mul.wide.s32 %rd81, %r280, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r399, 1;
setp.gt.u32 %p27, %r399, 3;
mov.u32 %r399, %r35;
@%p27 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p28, %r5, 0;
mov.f32 %f360, 0f00000000;
@%p28 bra $L__BB0_33;
setp.lt.u32 %p29, %r2, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p29 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p24 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p136, %r16, 4;
bar.sync 0;
@%p136 bra $L__BB0_40;
mov.u32 %r400, %r17;
$L__BB0_37:
setp.ge.u32 %p32, %r5, %r400;
@%p32 bra $L__BB0_39;
add.s32 %r281, %r400, %r15;
mul.wide.s32 %rd84, %r281, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r400, 1;
setp.gt.u32 %p33, %r400, 3;
mov.u32 %r400, %r37;
@%p33 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p28 bra $L__BB0_43;
setp.lt.u32 %p35, %r2, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p35 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p28 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p28 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p21 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r282, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r283, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r284, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r285, %f235;
mad.lo.s32 %r286, %r394, %r3, %r10;
mad.lo.s32 %r287, %r286, %r171, %r14;
mul.wide.s32 %rd88, %r287, 4;
add.s64 %rd87, %rd39, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r282,%r283,%r284,%r285};
// end inline asm
$L__BB0_49:
add.s32 %r394, %r394, 1;
setp.lt.s32 %p39, %r394, %r9;
@%p39 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r288, %tid.z;
mad.lo.s32 %r289, %r3, %r288, %r7;
mad.lo.s32 %r39, %r289, %r2, %r5;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r290, %r3;
mov.u32 %r291, 31;
sub.s32 %r292, %r291, %r290;
mov.u32 %r293, 1;
shl.b32 %r40, %r293, %r292;
setp.lt.u32 %p40, %r7, %r40;
add.s32 %r294, %r40, %r7;
setp.lt.u32 %p41, %r294, %r3;
and.pred %p5, %p40, %p41;
shl.b32 %r295, %r2, %r292;
add.s32 %r296, %r39, %r295;
mul.wide.s32 %rd91, %r296, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r297, %r40, 31;
add.s32 %r298, %r40, %r297;
shr.s32 %r415, %r298, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p42, %p5;
@%p42 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p43, %r40, 4;
bar.sync 0;
@%p43 bra $L__BB0_57;
mov.u32 %r401, %r415;
$L__BB0_54:
setp.ge.u32 %p44, %r7, %r401;
@%p44 bra $L__BB0_56;
mad.lo.s32 %r299, %r401, %r2, %r39;
mul.wide.s32 %rd92, %r299, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd94];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r401, 1;
setp.gt.u32 %p45, %r401, 3;
mov.u32 %r401, %r43;
@%p45 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r301, %r39, %r2;
mul.wide.u32 %rd95, %r301, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p46, %r7, 0;
mov.u32 %r402, 0;
@%p46 bra $L__BB0_61;
setp.lt.u32 %p47, %r3, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p47 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r402, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p42 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p43 bra $L__BB0_68;
mov.u32 %r403, %r415;
$L__BB0_65:
setp.ge.u32 %p50, %r7, %r403;
@%p50 bra $L__BB0_67;
mad.lo.s32 %r302, %r403, %r2, %r39;
mul.wide.s32 %rd97, %r302, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd99];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r403, 1;
setp.gt.u32 %p51, %r403, 3;
mov.u32 %r403, %r47;
@%p51 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r404, 0;
@%p46 bra $L__BB0_72;
setp.lt.u32 %p53, %r3, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p53 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r404, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p42 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p43 bra $L__BB0_79;
mov.u32 %r405, %r415;
$L__BB0_76:
setp.ge.u32 %p56, %r7, %r405;
@%p56 bra $L__BB0_78;
mad.lo.s32 %r304, %r405, %r2, %r39;
mul.wide.s32 %rd100, %r304, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd102];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r405, 1;
setp.gt.u32 %p57, %r405, 3;
mov.u32 %r405, %r51;
@%p57 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r406, 0;
@%p46 bra $L__BB0_83;
setp.lt.u32 %p59, %r3, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p59 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r406, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p42 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p43 bra $L__BB0_90;
mov.u32 %r407, %r415;
$L__BB0_87:
setp.ge.u32 %p62, %r7, %r407;
@%p62 bra $L__BB0_89;
mad.lo.s32 %r306, %r407, %r2, %r39;
mul.wide.s32 %rd103, %r306, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd105];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r407, 1;
setp.gt.u32 %p63, %r407, 3;
mov.u32 %r407, %r55;
@%p63 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r408, 0;
@%p46 bra $L__BB0_94;
setp.lt.u32 %p65, %r3, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p65 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r408, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p42 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p43 bra $L__BB0_101;
mov.u32 %r409, %r415;
$L__BB0_98:
setp.ge.u32 %p68, %r7, %r409;
@%p68 bra $L__BB0_100;
mad.lo.s32 %r308, %r409, %r2, %r39;
mul.wide.s32 %rd106, %r308, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd108];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r409, 1;
setp.gt.u32 %p69, %r409, 3;
mov.u32 %r409, %r59;
@%p69 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r410, 0;
@%p46 bra $L__BB0_105;
setp.lt.u32 %p71, %r3, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p71 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r410, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p42 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p43 bra $L__BB0_112;
mov.u32 %r411, %r415;
$L__BB0_109:
setp.ge.u32 %p74, %r7, %r411;
@%p74 bra $L__BB0_111;
mad.lo.s32 %r310, %r411, %r2, %r39;
mul.wide.s32 %rd109, %r310, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd111];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r411, 1;
setp.gt.u32 %p75, %r411, 3;
mov.u32 %r411, %r63;
@%p75 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r412, 0;
@%p46 bra $L__BB0_116;
setp.lt.u32 %p77, %r3, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p77 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r412, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p42 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p43 bra $L__BB0_123;
mov.u32 %r413, %r415;
$L__BB0_120:
setp.ge.u32 %p80, %r7, %r413;
@%p80 bra $L__BB0_122;
mad.lo.s32 %r312, %r413, %r2, %r39;
mul.wide.s32 %rd112, %r312, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd114];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r413, 1;
setp.gt.u32 %p81, %r413, 3;
mov.u32 %r413, %r67;
@%p81 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r414, 0;
@%p46 bra $L__BB0_127;
setp.lt.u32 %p83, %r3, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p83 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r414, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p42 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p43 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p86, %r7, %r415;
@%p86 bra $L__BB0_132;
mad.lo.s32 %r314, %r415, %r2, %r39;
mul.wide.s32 %rd115, %r314, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd117];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r415, 1;
setp.gt.u32 %p87, %r415, 3;
mov.u32 %r415, %r71;
@%p87 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r416, 0;
@%p46 bra $L__BB0_137;
setp.lt.u32 %p89, %r3, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p89 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r416, %f377;
$L__BB0_137:
setp.eq.s32 %p138, %r7, 0;
and.pred %p137, %p138, %p1;
bar.sync 0;
@%p137 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r391, %r5, 2;
mov.u32 %r324, %ctaid.y;
mad.lo.s32 %r325, %r171, %r324, %r391;
mul.wide.s32 %rd120, %r325, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r402,%r404,%r406,%r408};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r410,%r412,%r414,%r416};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r326, %r5, %r7;
or.b32 %r328, %r326, %r288;
setp.ne.s32 %p90, %r328, 0;
@%p90 bra $L__BB0_143;
ld.param.u64 %rd156, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd121, %rd156;
mov.u32 %r329, %ctaid.x;
mov.u32 %r330, %ctaid.z;
mov.u32 %r331, %nctaid.x;
mad.lo.s32 %r332, %r330, %r331, %r329;
mul.wide.s32 %rd122, %r332, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r333, %r8, -1;
setp.eq.s32 %p91, %r74, %r333;
cvt.s64.s32 %rd123, %r8;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p91;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p92, %rd128, 0;
@%p92 bra $L__BB0_143;
mov.u32 %r417, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r417;
// end inline asm
setp.lt.u32 %p93, %r417, 256;
selp.u32 %r336, 1, 0, %p93;
shl.b32 %r417, %r417, %r336;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p94, %rd130, -1;
@%p94 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
mov.u32 %r338, 1;
add.s32 %r339, %r171, 1;
shr.u32 %r340, %r339, 31;
add.s32 %r341, %r339, %r340;
shr.s32 %r342, %r341, 1;
add.s32 %r343, %r3, %r342;
add.s32 %r344, %r343, -1;
div.s32 %r345, %r344, %r3;
add.s32 %r346, %r8, -1;
add.s32 %r347, %r346, %r345;
div.s32 %r77, %r347, %r8;
add.s32 %r78, %r346, %r2;
shl.b32 %r79, %r7, 1;
shl.b32 %r348, %r3, 1;
mad.lo.s32 %r82, %r348, %r74, %r79;
or.b32 %r80, %r82, 1;
mul.lo.s32 %r81, %r348, %r8;
clz.b32 %r349, %r2;
mov.u32 %r350, 31;
sub.s32 %r351, %r350, %r349;
shl.b32 %r83, %r338, %r351;
setp.lt.u32 %p95, %r5, %r83;
add.s32 %r352, %r83, %r5;
setp.lt.u32 %p96, %r352, %r2;
and.pred %p6, %p95, %p96;
add.s32 %r353, %r39, %r83;
mul.wide.s32 %rd131, %r353, 4;
add.s64 %rd30, %rd45, %rd131;
shr.u32 %r354, %r83, 31;
add.s32 %r355, %r83, %r354;
shr.s32 %r84, %r355, 1;
add.s32 %r356, %r39, 1;
mul.wide.u32 %rd133, %r356, 4;
add.s64 %rd31, %rd45, %rd133;
mov.u32 %r418, 0;
not.pred %p122, %p6;
bra.uni $L__BB0_144;
$L__BB0_211:
add.s32 %r418, %r418, 1;
$L__BB0_144:
.pragma "nounroll";
setp.lt.s32 %p97, %r418, %r77;
@%p97 bra $L__BB0_180;
bra.uni $L__BB0_145;
$L__BB0_180:
div.s32 %r114, %r78, %r2;
setp.lt.s32 %p118, %r114, 1;
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p118 bra $L__BB0_186;
mul.lo.s32 %r375, %r81, %r418;
add.s32 %r115, %r80, %r375;
add.s32 %r116, %r82, %r375;
mov.u32 %r374, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r429, %r374;
$L__BB0_182:
.pragma "nounroll";
setp.ge.s32 %p119, %r115, %r171;
mov.u32 %r430, %r374;
mov.u32 %r431, %r374;
@%p119 bra $L__BB0_185;
mad.lo.s32 %r118, %r429, %r2, %r5;
setp.ge.s32 %p120, %r118, %r8;
mov.u32 %r430, %r374;
mov.u32 %r431, %r374;
@%p120 bra $L__BB0_185;
mad.lo.s32 %r382, %r118, %r171, %r116;
mul.wide.s32 %rd145, %r382, 4;
add.s64 %rd144, %rd43, %rd145;
// begin inline asm
ld.volatile.global.v2.s32 {%r431,%r430}, [%rd144];
// end inline asm
$L__BB0_185:
mov.b32 %f326, %r431;
add.f32 %f387, %f387, %f326;
mov.b32 %f327, %r430;
add.f32 %f386, %f386, %f327;
add.s32 %r429, %r429, 1;
setp.lt.s32 %p121, %r429, %r114;
@%p121 bra $L__BB0_182;
$L__BB0_186:
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p122 bra $L__BB0_188;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_188:
setp.lt.s32 %p123, %r83, 4;
bar.sync 0;
@%p123 bra $L__BB0_193;
mov.u32 %r432, %r84;
$L__BB0_190:
setp.ge.u32 %p124, %r5, %r432;
@%p124 bra $L__BB0_192;
add.s32 %r383, %r432, %r39;
mul.wide.s32 %rd146, %r383, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd148];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_192:
bar.sync 0;
shr.u32 %r125, %r432, 1;
setp.gt.u32 %p125, %r432, 3;
mov.u32 %r432, %r125;
@%p125 bra $L__BB0_190;
$L__BB0_193:
setp.ne.s32 %p126, %r5, 0;
mov.u32 %r433, 0;
@%p126 bra $L__BB0_197;
setp.lt.u32 %p127, %r2, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p127 bra $L__BB0_196;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_196:
mov.b32 %r433, %f388;
$L__BB0_197:
bar.sync 0;
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p122 bra $L__BB0_199;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_199:
bar.sync 0;
@%p123 bra $L__BB0_204;
mov.u32 %r434, %r84;
$L__BB0_201:
setp.ge.u32 %p130, %r5, %r434;
@%p130 bra $L__BB0_203;
add.s32 %r385, %r434, %r39;
mul.wide.s32 %rd149, %r385, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd151];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_203:
bar.sync 0;
shr.u32 %r129, %r434, 1;
setp.gt.u32 %p131, %r434, 3;
mov.u32 %r434, %r129;
@%p131 bra $L__BB0_201;
$L__BB0_204:
mov.u32 %r435, 0;
@%p126 bra $L__BB0_208;
setp.lt.u32 %p133, %r2, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p133 bra $L__BB0_207;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_207:
mov.b32 %r435, %f389;
$L__BB0_208:
bar.sync 0;
@%p126 bra $L__BB0_211;
mul.lo.s32 %r132, %r81, %r418;
add.s32 %r387, %r80, %r132;
setp.ge.s32 %p135, %r387, %r171;
@%p135 bra $L__BB0_211;
ld.param.u64 %rd155, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r390, %r82, %r132;
mul.wide.s32 %rd153, %r390, 4;
add.s64 %rd152, %rd155, %rd153;
// begin inline asm
st.global.cs.v2.s32 [%rd152], {%r433,%r435};
// end inline asm
bra.uni $L__BB0_211;
$L__BB0_145:
setp.lt.s32 %p98, %r77, 1;
@%p98 bra $L__BB0_179;
div.s32 %r86, %r78, %r2;
mad.lo.s32 %r87, %r171, %r5, %r79;
shl.b32 %r88, %r74, 1;
shl.b32 %r89, %r8, 1;
mul.lo.s32 %r90, %r171, %r2;
mov.u32 %r419, 0;
$L__BB0_147:
.pragma "nounroll";
setp.lt.s32 %p99, %r86, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p99 bra $L__BB0_153;
mad.lo.s32 %r92, %r81, %r419, %r80;
mad.lo.s32 %r359, %r89, %r419, %r88;
mad.lo.s32 %r421, %r3, %r359, %r87;
mov.u32 %r358, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r420, %r5;
mov.u32 %r422, %r358;
$L__BB0_149:
.pragma "nounroll";
setp.ge.s32 %p100, %r92, %r171;
mov.u32 %r423, %r358;
mov.u32 %r424, %r358;
@%p100 bra $L__BB0_152;
setp.ge.s32 %p101, %r420, %r8;
mov.u32 %r423, %r358;
mov.u32 %r424, %r358;
@%p101 bra $L__BB0_152;
mul.wide.s32 %rd135, %r421, 4;
add.s64 %rd134, %rd42, %rd135;
// begin inline asm
ld.volatile.global.v2.s32 {%r424,%r423}, [%rd134];
// end inline asm
$L__BB0_152:
mov.b32 %f304, %r424;
add.f32 %f381, %f381, %f304;
mov.b32 %f305, %r423;
add.f32 %f380, %f380, %f305;
add.s32 %r421, %r421, %r90;
add.s32 %r420, %r420, %r2;
add.s32 %r422, %r422, 1;
setp.lt.s32 %p102, %r422, %r86;
@%p102 bra $L__BB0_149;
$L__BB0_153:
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p122 bra $L__BB0_155;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_155:
setp.lt.s32 %p104, %r83, 4;
bar.sync 0;
@%p104 bra $L__BB0_160;
mov.u32 %r425, %r84;
$L__BB0_157:
setp.ge.u32 %p105, %r5, %r425;
@%p105 bra $L__BB0_159;
add.s32 %r366, %r425, %r39;
mul.wide.s32 %rd136, %r366, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd138];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_159:
bar.sync 0;
shr.u32 %r105, %r425, 1;
setp.gt.u32 %p106, %r425, 3;
mov.u32 %r425, %r105;
@%p106 bra $L__BB0_157;
$L__BB0_160:
setp.ne.s32 %p107, %r5, 0;
mov.u32 %r426, 0;
@%p107 bra $L__BB0_164;
setp.lt.u32 %p108, %r2, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p108 bra $L__BB0_163;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_163:
mov.b32 %r426, %f382;
$L__BB0_164:
bar.sync 0;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
@%p122 bra $L__BB0_166;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_166:
bar.sync 0;
@%p104 bra $L__BB0_171;
mov.u32 %r427, %r84;
$L__BB0_168:
setp.ge.u32 %p111, %r5, %r427;
@%p111 bra $L__BB0_170;
add.s32 %r368, %r427, %r39;
mul.wide.s32 %rd139, %r368, 4;
add.s64 %rd141, %rd45, %rd139;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd141];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_170:
bar.sync 0;
shr.u32 %r109, %r427, 1;
setp.gt.u32 %p112, %r427, 3;
mov.u32 %r427, %r109;
@%p112 bra $L__BB0_168;
$L__BB0_171:
mov.u32 %r428, 0;
@%p107 bra $L__BB0_175;
setp.lt.u32 %p114, %r2, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p114 bra $L__BB0_174;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_174:
mov.b32 %r428, %f383;
$L__BB0_175:
bar.sync 0;
@%p107 bra $L__BB0_178;
mul.lo.s32 %r112, %r81, %r419;
add.s32 %r370, %r80, %r112;
setp.ge.s32 %p116, %r370, %r171;
@%p116 bra $L__BB0_178;
ld.param.u64 %rd154, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_7b5a152a_1033910nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r373, %r82, %r112;
mul.wide.s32 %rd143, %r373, 4;
add.s64 %rd142, %rd154, %rd143;
// begin inline asm
st.global.cs.v2.s32 [%rd142], {%r426,%r428};
// end inline asm
$L__BB0_178:
add.s32 %r419, %r419, 1;
setp.lt.s32 %p117, %r419, %r77;
@%p117 bra $L__BB0_147;
$L__BB0_179:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<140>;
.reg .f32 %f<390>;
.reg .b32 %r<434>;
.reg .f64 %fd<3>;
.reg .b64 %rd<157>;
ld.param.v2.u32 {%r170, %r171}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r180, %r181}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r184, %r185}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r206, %r171, 3;
shr.s32 %r207, %r206, 31;
shr.u32 %r208, %r207, 30;
add.s32 %r209, %r206, %r208;
shr.s32 %r2, %r209, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r210, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r211, %r4, 2;
mad.lo.s32 %r212, %r211, %r210, 15;
and.b32 %r213, %r212, -16;
cvt.u64.u32 %rd1, %r213;
mul.lo.s32 %r214, %r4, %r2;
shl.b32 %r215, %r214, 4;
or.b32 %r216, %r215, 15;
and.b32 %r5, %r216, -16;
add.s32 %r217, %r216, %r5;
and.b32 %r218, %r217, -16;
cvt.s64.s32 %rd2, %r218;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p7, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r219, %r7, 3;
setp.lt.s32 %p8, %r219, %r171;
and.pred %p1, %p8, %p7;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p9, %r8, 0;
and.pred %p2, %p9, %p1;
not.pred %p10, %p2;
@%p10 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r220, smem_ptr; }
// end inline asm
shl.b32 %r223, %r6, 4;
add.s32 %r221, %r220, %r223;
mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd38, %rd49;
mov.u32 %r222, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r222, 0;
cp.async.ca.shared.global [%r221], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r224, %r4, 215;
div.s32 %r225, %r224, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r226, %r9, %r225;
add.s32 %r227, %r226, -1;
div.s32 %r10, %r227, %r9;
setp.gt.s32 %p11, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p11 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r171;
cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r229, %ctaid.y;
mul.lo.s32 %r230, %r10, %r4;
mul.lo.s32 %r11, %r230, %r229;
mad.lo.s32 %r231, %r2, %r8, %r6;
shl.b32 %r12, %r231, 4;
mul.lo.s32 %r232, %r171, %r8;
cvt.s64.s32 %rd54, %r232;
cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r233, %r11, %r171;
cvt.s64.s32 %rd6, %r233;
mul.lo.s32 %r13, %r171, %r4;
mul.lo.s32 %r14, %r10, %r229;
shl.b32 %r234, %r8, 2;
mad.lo.s32 %r235, %r234, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r235, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r236, %tid.z;
mad.lo.s32 %r237, %r4, %r236, %r8;
mad.lo.s32 %r15, %r237, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r238, %r3;
mov.u32 %r239, 31;
sub.s32 %r240, %r239, %r238;
mov.u32 %r241, 1;
shl.b32 %r16, %r241, %r240;
setp.lt.u32 %p12, %r6, %r16;
add.s32 %r242, %r16, %r6;
setp.lt.u32 %p13, %r242, %r3;
and.pred %p3, %p12, %p13;
add.s32 %r243, %r15, %r16;
mul.wide.s32 %rd59, %r243, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r244, %r16, 31;
add.s32 %r245, %r16, %r244;
shr.s32 %r17, %r245, 1;
add.s64 %rd10, %rd53, %rd57;
add.s32 %r246, %r15, 1;
mul.wide.u32 %rd60, %r246, 4;
add.s64 %rd11, %rd45, %rd60;
add.s64 %rd61, %rd45, %rd4;
mul.wide.s32 %rd62, %r7, 4;
add.s64 %rd12, %rd61, %rd62;
mul.wide.s32 %rd63, %r237, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd37;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r392, 0;
mov.f32 %f354, 0f00000000;
not.pred %p14, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r249, smem_ptr; }
// end inline asm
add.s32 %r250, %r249, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r275, smem_ptr; }
// end inline asm
add.s32 %r276, %r275, %r12;
not.pred %p24, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_5:
.pragma "nounroll";
@%p14 bra $L__BB0_8;
mad.lo.s32 %r247, %r392, %r4, %r8;
add.s32 %r248, %r247, %r11;
setp.gt.s32 %p15, %r248, 215;
@%p15 bra $L__BB0_8;
mul.lo.s32 %r252, %r13, %r392;
cvt.s64.s32 %rd67, %r252;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd34, %rd70;
mov.u32 %r251, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r251, 0;
cp.async.ca.shared.global [%r250], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p14 bra $L__BB0_10;
add.s32 %r253, %r14, %r392;
mad.lo.s32 %r254, %r253, %r4, %r8;
setp.lt.s32 %p17, %r254, 216;
@%p17 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r393, %r394, %r395, %r396}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r263, %r14, %r392;
mad.lo.s32 %r264, %r263, %r4, %r8;
setp.gt.s32 %p18, %r264, 215;
mov.u32 %r393, 0;
mov.u32 %r394, %r393;
mov.u32 %r395, %r393;
mov.u32 %r396, %r393;
@%p18 bra $L__BB0_15;
ld.shared.v4.u32 {%r393, %r394, %r395, %r396}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r393, 0;
mov.u32 %r394, %r393;
mov.u32 %r395, %r393;
mov.u32 %r396, %r393;
$L__BB0_15:
add.s32 %r273, %r14, %r392;
mad.lo.s32 %r33, %r273, %r4, %r8;
mov.b32 %f112, %r396;
add.f32 %f369, %f369, %f112;
mov.b32 %f113, %r395;
add.f32 %f368, %f368, %f113;
mov.b32 %f114, %r394;
add.f32 %f367, %f367, %f114;
mov.b32 %f115, %r393;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p19, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p19 bra $L__BB0_17;
mul.lo.s32 %r274, %r33, %r180;
mul.wide.s32 %rd71, %r274, 4;
add.s64 %rd72, %rd16, %rd71;
ld.global.f32 %f352, [%rd72];
$L__BB0_17:
setp.lt.s32 %p20, %r33, 216;
and.pred %p4, %p1, %p20;
not.pred %p21, %p4;
@%p21 bra $L__BB0_19;
mul.lo.s32 %r278, %r13, %r392;
cvt.s64.s32 %rd75, %r278;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd35, %rd78;
mov.u32 %r277, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r277, 0;
cp.async.ca.shared.global [%r276], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r391, %r14, %r392;
mad.lo.s32 %r390, %r391, %r4, %r8;
setp.gt.s32 %p139, %r390, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p139 bra $L__BB0_21;
mul.lo.s32 %r279, %r33, %r184;
mul.wide.s32 %rd79, %r279, 4;
add.s64 %rd80, %rd17, %rd79;
ld.global.f32 %f353, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f359, %f358;
@%p21 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd7];
mul.f32 %f129, %f119, %f124;
add.f32 %f130, %f129, 0f00000000;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
sub.f32 %f136, %f131, %f352;
mul.f32 %f137, %f353, %f136;
fma.rn.f32 %f138, %f129, %f137, 0f00000000;
fma.rn.f32 %f354, %f137, %f124, %f354;
mul.f32 %f141, %f120, %f125;
add.f32 %f142, %f130, %f141;
sub.f32 %f144, %f132, %f352;
mul.f32 %f145, %f353, %f144;
fma.rn.f32 %f146, %f141, %f145, %f138;
fma.rn.f32 %f355, %f145, %f125, %f355;
mul.f32 %f149, %f121, %f126;
add.f32 %f150, %f142, %f149;
sub.f32 %f152, %f133, %f352;
mul.f32 %f153, %f353, %f152;
fma.rn.f32 %f154, %f149, %f153, %f146;
fma.rn.f32 %f356, %f153, %f126, %f356;
mul.f32 %f157, %f122, %f127;
add.f32 %f359, %f150, %f157;
sub.f32 %f159, %f134, %f352;
mul.f32 %f160, %f353, %f159;
fma.rn.f32 %f358, %f157, %f160, %f154;
fma.rn.f32 %f357, %f160, %f127, %f357;
$L__BB0_23:
st.shared.f32 [%rd8], %f359;
bar.sync 0;
@%p24 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_30;
mov.u32 %r397, %r17;
$L__BB0_27:
setp.ge.u32 %p26, %r6, %r397;
@%p26 bra $L__BB0_29;
add.s32 %r280, %r397, %r15;
mul.wide.s32 %rd81, %r280, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r397, 1;
setp.gt.u32 %p27, %r397, 3;
mov.u32 %r397, %r35;
@%p27 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p28, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p28 bra $L__BB0_33;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p29 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f360, %f360, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f358;
bar.sync 0;
@%p24 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p136, %r16, 4;
bar.sync 0;
@%p136 bra $L__BB0_40;
mov.u32 %r398, %r17;
$L__BB0_37:
setp.ge.u32 %p32, %r6, %r398;
@%p32 bra $L__BB0_39;
add.s32 %r281, %r398, %r15;
mul.wide.s32 %rd84, %r281, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r398, 1;
setp.gt.u32 %p33, %r398, 3;
mov.u32 %r398, %r37;
@%p33 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p28 bra $L__BB0_43;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p35 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f361, %f361, %f178;
$L__BB0_43:
bar.sync 0;
@%p28 bra $L__BB0_45;
st.shared.f32 [%rd13], %f360;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p28 bra $L__BB0_47;
st.shared.f32 [%rd13], %f361;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p21 bra $L__BB0_49;
mul.f32 %f179, %f353, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd12];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd10];
sub.f32 %f197, %f192, %f352;
mul.f32 %f198, %f353, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r282, %f202;
mul.f32 %f205, %f181, %f186;
mul.f32 %f206, %f205, %f2;
sub.f32 %f208, %f193, %f352;
mul.f32 %f209, %f353, %f208;
sub.f32 %f210, %f206, %f37;
mul.f32 %f211, %f38, %f209;
sub.f32 %f212, %f210, %f211;
mul.f32 %f213, %f179, %f212;
mov.b32 %r283, %f213;
mul.f32 %f216, %f182, %f187;
mul.f32 %f217, %f216, %f2;
sub.f32 %f219, %f194, %f352;
mul.f32 %f220, %f353, %f219;
sub.f32 %f221, %f217, %f37;
mul.f32 %f222, %f38, %f220;
sub.f32 %f223, %f221, %f222;
mul.f32 %f224, %f179, %f223;
mov.b32 %r284, %f224;
mul.f32 %f227, %f183, %f188;
mul.f32 %f228, %f227, %f2;
sub.f32 %f230, %f195, %f352;
mul.f32 %f231, %f353, %f230;
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r285, %f235;
mad.lo.s32 %r286, %r33, %r171, %r7;
mul.wide.s32 %rd88, %r286, 4;
add.s64 %rd87, %rd39, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r282,%r283,%r284,%r285};
// end inline asm
$L__BB0_49:
add.s32 %r392, %r392, 1;
setp.lt.s32 %p39, %r392, %r10;
@%p39 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
mov.u32 %r287, %tid.z;
mad.lo.s32 %r288, %r4, %r287, %r8;
mad.lo.s32 %r39, %r288, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r289, %r4;
mov.u32 %r290, 31;
sub.s32 %r291, %r290, %r289;
mov.u32 %r292, 1;
shl.b32 %r40, %r292, %r291;
setp.lt.u32 %p40, %r8, %r40;
add.s32 %r293, %r40, %r8;
setp.lt.u32 %p41, %r293, %r4;
and.pred %p5, %p40, %p41;
shl.b32 %r294, %r3, %r291;
add.s32 %r295, %r39, %r294;
mul.wide.s32 %rd91, %r295, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r296, %r40, 31;
add.s32 %r297, %r40, %r296;
shr.s32 %r413, %r297, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p42, %p5;
@%p42 bra $L__BB0_52;
ld.shared.f32 %f236, [%rd24];
ld.shared.f32 %f237, [%rd23];
add.f32 %f238, %f236, %f237;
st.shared.f32 [%rd23], %f238;
$L__BB0_52:
setp.lt.s32 %p43, %r40, 4;
bar.sync 0;
@%p43 bra $L__BB0_57;
mov.u32 %r399, %r413;
$L__BB0_54:
setp.ge.u32 %p44, %r8, %r399;
@%p44 bra $L__BB0_56;
mad.lo.s32 %r298, %r399, %r3, %r39;
mul.wide.s32 %rd92, %r298, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd94];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r399, 1;
setp.gt.u32 %p45, %r399, 3;
mov.u32 %r399, %r43;
@%p45 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r300, %r39, %r3;
mul.wide.u32 %rd95, %r300, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p46, %r8, 0;
mov.u32 %r400, 0;
@%p46 bra $L__BB0_61;
setp.lt.u32 %p47, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p47 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
mov.b32 %r400, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@%p42 bra $L__BB0_63;
ld.shared.f32 %f244, [%rd24];
ld.shared.f32 %f245, [%rd23];
add.f32 %f246, %f244, %f245;
st.shared.f32 [%rd23], %f246;
$L__BB0_63:
bar.sync 0;
@%p43 bra $L__BB0_68;
mov.u32 %r401, %r413;
$L__BB0_65:
setp.ge.u32 %p50, %r8, %r401;
@%p50 bra $L__BB0_67;
mad.lo.s32 %r301, %r401, %r3, %r39;
mul.wide.s32 %rd97, %r301, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd99];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r401, 1;
setp.gt.u32 %p51, %r401, 3;
mov.u32 %r401, %r47;
@%p51 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r402, 0;
@%p46 bra $L__BB0_72;
setp.lt.u32 %p53, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p53 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
mov.b32 %r402, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@%p42 bra $L__BB0_74;
ld.shared.f32 %f252, [%rd24];
ld.shared.f32 %f253, [%rd23];
add.f32 %f254, %f252, %f253;
st.shared.f32 [%rd23], %f254;
$L__BB0_74:
bar.sync 0;
@%p43 bra $L__BB0_79;
mov.u32 %r403, %r413;
$L__BB0_76:
setp.ge.u32 %p56, %r8, %r403;
@%p56 bra $L__BB0_78;
mad.lo.s32 %r303, %r403, %r3, %r39;
mul.wide.s32 %rd100, %r303, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd102];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r403, 1;
setp.gt.u32 %p57, %r403, 3;
mov.u32 %r403, %r51;
@%p57 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r404, 0;
@%p46 bra $L__BB0_83;
setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p59 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
mov.b32 %r404, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@%p42 bra $L__BB0_85;
ld.shared.f32 %f260, [%rd24];
ld.shared.f32 %f261, [%rd23];
add.f32 %f262, %f260, %f261;
st.shared.f32 [%rd23], %f262;
$L__BB0_85:
bar.sync 0;
@%p43 bra $L__BB0_90;
mov.u32 %r405, %r413;
$L__BB0_87:
setp.ge.u32 %p62, %r8, %r405;
@%p62 bra $L__BB0_89;
mad.lo.s32 %r305, %r405, %r3, %r39;
mul.wide.s32 %rd103, %r305, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd105];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r405, 1;
setp.gt.u32 %p63, %r405, 3;
mov.u32 %r405, %r55;
@%p63 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r406, 0;
@%p46 bra $L__BB0_94;
setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p65 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
mov.b32 %r406, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@%p42 bra $L__BB0_96;
ld.shared.f32 %f268, [%rd24];
ld.shared.f32 %f269, [%rd23];
add.f32 %f270, %f268, %f269;
st.shared.f32 [%rd23], %f270;
$L__BB0_96:
bar.sync 0;
@%p43 bra $L__BB0_101;
mov.u32 %r407, %r413;
$L__BB0_98:
setp.ge.u32 %p68, %r8, %r407;
@%p68 bra $L__BB0_100;
mad.lo.s32 %r307, %r407, %r3, %r39;
mul.wide.s32 %rd106, %r307, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd108];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r407, 1;
setp.gt.u32 %p69, %r407, 3;
mov.u32 %r407, %r59;
@%p69 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r408, 0;
@%p46 bra $L__BB0_105;
setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p71 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
mov.b32 %r408, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@%p42 bra $L__BB0_107;
ld.shared.f32 %f276, [%rd24];
ld.shared.f32 %f277, [%rd23];
add.f32 %f278, %f276, %f277;
st.shared.f32 [%rd23], %f278;
$L__BB0_107:
bar.sync 0;
@%p43 bra $L__BB0_112;
mov.u32 %r409, %r413;
$L__BB0_109:
setp.ge.u32 %p74, %r8, %r409;
@%p74 bra $L__BB0_111;
mad.lo.s32 %r309, %r409, %r3, %r39;
mul.wide.s32 %rd109, %r309, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd111];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r409, 1;
setp.gt.u32 %p75, %r409, 3;
mov.u32 %r409, %r63;
@%p75 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r410, 0;
@%p46 bra $L__BB0_116;
setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p77 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
mov.b32 %r410, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p42 bra $L__BB0_118;
ld.shared.f32 %f284, [%rd24];
ld.shared.f32 %f285, [%rd23];
add.f32 %f286, %f284, %f285;
st.shared.f32 [%rd23], %f286;
$L__BB0_118:
bar.sync 0;
@%p43 bra $L__BB0_123;
mov.u32 %r411, %r413;
$L__BB0_120:
setp.ge.u32 %p80, %r8, %r411;
@%p80 bra $L__BB0_122;
mad.lo.s32 %r311, %r411, %r3, %r39;
mul.wide.s32 %rd112, %r311, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd114];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r411, 1;
setp.gt.u32 %p81, %r411, 3;
mov.u32 %r411, %r67;
@%p81 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r412, 0;
@%p46 bra $L__BB0_127;
setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p83 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
mov.b32 %r412, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p42 bra $L__BB0_129;
ld.shared.f32 %f292, [%rd24];
ld.shared.f32 %f293, [%rd23];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd23], %f294;
$L__BB0_129:
bar.sync 0;
@%p43 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p86, %r8, %r413;
@%p86 bra $L__BB0_132;
mad.lo.s32 %r313, %r413, %r3, %r39;
mul.wide.s32 %rd115, %r313, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd117];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r413, 1;
setp.gt.u32 %p87, %r413, 3;
mov.u32 %r413, %r71;
@%p87 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r414, 0;
@%p46 bra $L__BB0_137;
setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p89 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
mov.b32 %r414, %f377;
$L__BB0_137:
setp.eq.s32 %p138, %r8, 0;
and.pred %p137, %p138, %p1;
bar.sync 0;
@%p137 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r323, %ctaid.y;
mad.lo.s32 %r324, %r171, %r323, %r7;
mul.wide.s32 %rd120, %r324, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r400,%r402,%r404,%r406};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r408,%r410,%r412,%r414};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r325, %r6, %r8;
or.b32 %r327, %r325, %r287;
setp.ne.s32 %p90, %r327, 0;
@%p90 bra $L__BB0_143;
ld.param.u64 %rd156, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd121, %rd156;
mov.u32 %r328, %ctaid.x;
mov.u32 %r329, %ctaid.z;
mov.u32 %r330, %nctaid.x;
mad.lo.s32 %r331, %r329, %r330, %r328;
mul.wide.s32 %rd122, %r331, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r332, %r9, -1;
setp.eq.s32 %p91, %r74, %r332;
cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p91;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p92, %rd128, 0;
@%p92 bra $L__BB0_143;
mov.u32 %r415, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r415;
// end inline asm
setp.lt.u32 %p93, %r415, 256;
selp.u32 %r335, 1, 0, %p93;
shl.b32 %r415, %r415, %r335;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p94, %rd130, -1;
@%p94 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
mov.u32 %r337, 1;
add.s32 %r338, %r171, 1;
shr.u32 %r339, %r338, 31;
add.s32 %r340, %r338, %r339;
shr.s32 %r341, %r340, 1;
add.s32 %r342, %r4, %r341;
add.s32 %r343, %r342, -1;
div.s32 %r344, %r343, %r4;
add.s32 %r345, %r9, -1;
add.s32 %r346, %r345, %r344;
div.s32 %r77, %r346, %r9;
add.s32 %r78, %r345, %r3;
shl.b32 %r79, %r8, 1;
shl.b32 %r347, %r4, 1;
mad.lo.s32 %r82, %r347, %r74, %r79;
or.b32 %r80, %r82, 1;
mul.lo.s32 %r81, %r347, %r9;
clz.b32 %r348, %r3;
mov.u32 %r349, 31;
sub.s32 %r350, %r349, %r348;
shl.b32 %r83, %r337, %r350;
setp.lt.u32 %p95, %r6, %r83;
add.s32 %r351, %r83, %r6;
setp.lt.u32 %p96, %r351, %r3;
and.pred %p6, %p95, %p96;
add.s32 %r352, %r39, %r83;
mul.wide.s32 %rd131, %r352, 4;
add.s64 %rd30, %rd45, %rd131;
shr.u32 %r353, %r83, 31;
add.s32 %r354, %r83, %r353;
shr.s32 %r84, %r354, 1;
add.s32 %r355, %r39, 1;
mul.wide.u32 %rd133, %r355, 4;
add.s64 %rd31, %rd45, %rd133;
mov.u32 %r416, 0;
not.pred %p122, %p6;
bra.uni $L__BB0_144;
$L__BB0_211:
add.s32 %r416, %r416, 1;
$L__BB0_144:
.pragma "nounroll";
setp.lt.s32 %p97, %r416, %r77;
@%p97 bra $L__BB0_180;
bra.uni $L__BB0_145;
$L__BB0_180:
div.s32 %r114, %r78, %r3;
setp.lt.s32 %p118, %r114, 1;
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p118 bra $L__BB0_186;
mul.lo.s32 %r374, %r81, %r416;
add.s32 %r115, %r80, %r374;
add.s32 %r116, %r82, %r374;
mov.u32 %r373, 0;
mov.f32 %f386, 0f00000000;
mov.u32 %r427, %r373;
$L__BB0_182:
.pragma "nounroll";
setp.ge.s32 %p119, %r115, %r171;
mov.u32 %r428, %r373;
mov.u32 %r429, %r373;
@%p119 bra $L__BB0_185;
mad.lo.s32 %r118, %r427, %r3, %r6;
setp.ge.s32 %p120, %r118, %r9;
mov.u32 %r428, %r373;
mov.u32 %r429, %r373;
@%p120 bra $L__BB0_185;
mad.lo.s32 %r381, %r118, %r171, %r116;
mul.wide.s32 %rd145, %r381, 4;
add.s64 %rd144, %rd43, %rd145;
// begin inline asm
ld.volatile.global.v2.s32 {%r429,%r428}, [%rd144];
// end inline asm
$L__BB0_185:
mov.b32 %f326, %r429;
add.f32 %f387, %f387, %f326;
mov.b32 %f327, %r428;
add.f32 %f386, %f386, %f327;
add.s32 %r427, %r427, 1;
setp.lt.s32 %p121, %r427, %r114;
@%p121 bra $L__BB0_182;
$L__BB0_186:
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@%p122 bra $L__BB0_188;
ld.shared.f32 %f328, [%rd30];
ld.shared.f32 %f329, [%rd23];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd23], %f330;
$L__BB0_188:
setp.lt.s32 %p123, %r83, 4;
bar.sync 0;
@%p123 bra $L__BB0_193;
mov.u32 %r430, %r84;
$L__BB0_190:
setp.ge.u32 %p124, %r6, %r430;
@%p124 bra $L__BB0_192;
add.s32 %r382, %r430, %r39;
mul.wide.s32 %rd146, %r382, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd148];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_192:
bar.sync 0;
shr.u32 %r125, %r430, 1;
setp.gt.u32 %p125, %r430, 3;
mov.u32 %r430, %r125;
@%p125 bra $L__BB0_190;
$L__BB0_193:
setp.ne.s32 %p126, %r6, 0;
mov.u32 %r431, 0;
@%p126 bra $L__BB0_197;
setp.lt.u32 %p127, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p127 bra $L__BB0_196;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_196:
mov.b32 %r431, %f388;
$L__BB0_197:
bar.sync 0;
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@%p122 bra $L__BB0_199;
ld.shared.f32 %f336, [%rd30];
ld.shared.f32 %f337, [%rd23];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd23], %f338;
$L__BB0_199:
bar.sync 0;
@%p123 bra $L__BB0_204;
mov.u32 %r432, %r84;
$L__BB0_201:
setp.ge.u32 %p130, %r6, %r432;
@%p130 bra $L__BB0_203;
add.s32 %r384, %r432, %r39;
mul.wide.s32 %rd149, %r384, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd151];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_203:
bar.sync 0;
shr.u32 %r129, %r432, 1;
setp.gt.u32 %p131, %r432, 3;
mov.u32 %r432, %r129;
@%p131 bra $L__BB0_201;
$L__BB0_204:
mov.u32 %r433, 0;
@%p126 bra $L__BB0_208;
setp.lt.u32 %p133, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p133 bra $L__BB0_207;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_207:
mov.b32 %r433, %f389;
$L__BB0_208:
bar.sync 0;
@%p126 bra $L__BB0_211;
mul.lo.s32 %r132, %r81, %r416;
add.s32 %r386, %r80, %r132;
setp.ge.s32 %p135, %r386, %r171;
@%p135 bra $L__BB0_211;
ld.param.u64 %rd155, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r389, %r82, %r132;
mul.wide.s32 %rd153, %r389, 4;
add.s64 %rd152, %rd155, %rd153;
// begin inline asm
st.global.cs.v2.s32 [%rd152], {%r431,%r433};
// end inline asm
bra.uni $L__BB0_211;
$L__BB0_145:
setp.lt.s32 %p98, %r77, 1;
@%p98 bra $L__BB0_179;
div.s32 %r86, %r78, %r3;
mad.lo.s32 %r87, %r171, %r6, %r79;
shl.b32 %r88, %r74, 1;
shl.b32 %r89, %r9, 1;
mul.lo.s32 %r90, %r171, %r3;
mov.u32 %r417, 0;
$L__BB0_147:
.pragma "nounroll";
setp.lt.s32 %p99, %r86, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p99 bra $L__BB0_153;
mad.lo.s32 %r92, %r81, %r417, %r80;
mad.lo.s32 %r358, %r89, %r417, %r88;
mad.lo.s32 %r419, %r4, %r358, %r87;
mov.u32 %r357, 0;
mov.f32 %f380, 0f00000000;
mov.u32 %r418, %r6;
mov.u32 %r420, %r357;
$L__BB0_149:
.pragma "nounroll";
setp.ge.s32 %p100, %r92, %r171;
mov.u32 %r421, %r357;
mov.u32 %r422, %r357;
@%p100 bra $L__BB0_152;
setp.ge.s32 %p101, %r418, %r9;
mov.u32 %r421, %r357;
mov.u32 %r422, %r357;
@%p101 bra $L__BB0_152;
mul.wide.s32 %rd135, %r419, 4;
add.s64 %rd134, %rd42, %rd135;
// begin inline asm
ld.volatile.global.v2.s32 {%r422,%r421}, [%rd134];
// end inline asm
$L__BB0_152:
mov.b32 %f304, %r422;
add.f32 %f381, %f381, %f304;
mov.b32 %f305, %r421;
add.f32 %f380, %f380, %f305;
add.s32 %r419, %r419, %r90;
add.s32 %r418, %r418, %r3;
add.s32 %r420, %r420, 1;
setp.lt.s32 %p102, %r420, %r86;
@%p102 bra $L__BB0_149;
$L__BB0_153:
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p122 bra $L__BB0_155;
ld.shared.f32 %f306, [%rd30];
ld.shared.f32 %f307, [%rd23];
add.f32 %f308, %f306, %f307;
st.shared.f32 [%rd23], %f308;
$L__BB0_155:
setp.lt.s32 %p104, %r83, 4;
bar.sync 0;
@%p104 bra $L__BB0_160;
mov.u32 %r423, %r84;
$L__BB0_157:
setp.ge.u32 %p105, %r6, %r423;
@%p105 bra $L__BB0_159;
add.s32 %r365, %r423, %r39;
mul.wide.s32 %rd136, %r365, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd138];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_159:
bar.sync 0;
shr.u32 %r105, %r423, 1;
setp.gt.u32 %p106, %r423, 3;
mov.u32 %r423, %r105;
@%p106 bra $L__BB0_157;
$L__BB0_160:
setp.ne.s32 %p107, %r6, 0;
mov.u32 %r424, 0;
@%p107 bra $L__BB0_164;
setp.lt.u32 %p108, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p108 bra $L__BB0_163;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_163:
mov.b32 %r424, %f382;
$L__BB0_164:
bar.sync 0;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
@%p122 bra $L__BB0_166;
ld.shared.f32 %f314, [%rd30];
ld.shared.f32 %f315, [%rd23];
add.f32 %f316, %f314, %f315;
st.shared.f32 [%rd23], %f316;
$L__BB0_166:
bar.sync 0;
@%p104 bra $L__BB0_171;
mov.u32 %r425, %r84;
$L__BB0_168:
setp.ge.u32 %p111, %r6, %r425;
@%p111 bra $L__BB0_170;
add.s32 %r367, %r425, %r39;
mul.wide.s32 %rd139, %r367, 4;
add.s64 %rd141, %rd45, %rd139;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd141];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_170:
bar.sync 0;
shr.u32 %r109, %r425, 1;
setp.gt.u32 %p112, %r425, 3;
mov.u32 %r425, %r109;
@%p112 bra $L__BB0_168;
$L__BB0_171:
mov.u32 %r426, 0;
@%p107 bra $L__BB0_175;
setp.lt.u32 %p114, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p114 bra $L__BB0_174;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_174:
mov.b32 %r426, %f383;
$L__BB0_175:
bar.sync 0;
@%p107 bra $L__BB0_178;
mul.lo.s32 %r112, %r81, %r417;
add.s32 %r369, %r80, %r112;
setp.ge.s32 %p116, %r369, %r171;
@%p116 bra $L__BB0_178;
ld.param.u64 %rd154, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_17_cu_aab2bf46_723310nvfuser_17ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r372, %r82, %r112;
mul.wide.s32 %rd143, %r372, 4;
add.s64 %rd142, %rd154, %rd143;
// begin inline asm
st.global.cs.v2.s32 [%rd142], {%r424,%r426};
// end inline asm
$L__BB0_178:
add.s32 %r417, %r417, 1;
setp.lt.s32 %p117, %r417, %r77;
@%p117 bra $L__BB0_147;
$L__BB0_179:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<140>;
.reg .f32 %f<390>;
- .reg .b32 %r<436>;
+ .reg .b32 %r<434>;
.reg .f64 %fd<3>;
.reg .b64 %rd<157>;
ld.param.v2.u32 {%r170, %r171}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
@@ -50,110 +50,110 @@
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r206, %r171, 3;
shr.s32 %r207, %r206, 31;
shr.u32 %r208, %r207, 30;
add.s32 %r209, %r206, %r208;
- shr.s32 %r210, %r209, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r211, %r210, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r212, %r3, 2;
- mad.lo.s32 %r213, %r212, %r211, 15;
- and.b32 %r214, %r213, -16;
- cvt.u64.u32 %rd1, %r214;
- mul.lo.s32 %r215, %r3, %r210;
- shl.b32 %r216, %r215, 4;
- or.b32 %r217, %r216, 15;
- and.b32 %r4, %r217, -16;
- add.s32 %r218, %r217, %r4;
- and.b32 %r219, %r218, -16;
- cvt.s64.s32 %rd2, %r219;
+ shr.s32 %r2, %r209, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r210, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r211, %r4, 2;
+ mad.lo.s32 %r212, %r211, %r210, 15;
+ and.b32 %r213, %r212, -16;
+ cvt.u64.u32 %rd1, %r213;
+ mul.lo.s32 %r214, %r4, %r2;
+ shl.b32 %r215, %r214, 4;
+ or.b32 %r216, %r215, 15;
+ and.b32 %r5, %r216, -16;
+ add.s32 %r217, %r216, %r5;
+ and.b32 %r218, %r217, -16;
+ cvt.s64.s32 %rd2, %r218;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p7, %r5, %r210;
- shl.b32 %r6, %r5, 2;
- or.b32 %r220, %r6, 3;
- setp.lt.s32 %p8, %r220, %r171;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p7, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r219, %r7, 3;
+ setp.lt.s32 %p8, %r219, %r171;
and.pred %p1, %p8, %p7;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p9, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p9, %r8, 0;
and.pred %p2, %p9, %p1;
not.pred %p10, %p2;
@%p10 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r221, smem_ptr; }
-
-
- shl.b32 %r224, %r5, 4;
- add.s32 %r222, %r221, %r224;
- mul.wide.s32 %rd49, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r220, smem_ptr; }
+
+
+ shl.b32 %r223, %r6, 4;
+ add.s32 %r221, %r220, %r223;
+ mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd38, %rd49;
- mov.u32 %r223, 0;
+ mov.u32 %r222, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r223, 0;
- cp.async.ca.shared.global [%r222], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r222, 0;
+ cp.async.ca.shared.global [%r221], [%rd48], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r225, %r3, 215;
- div.s32 %r226, %r225, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r227, %r8, %r226;
- add.s32 %r228, %r227, -1;
- div.s32 %r9, %r228, %r8;
- setp.gt.s32 %p11, %r9, 0;
+ add.s32 %r224, %r4, 215;
+ div.s32 %r225, %r224, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r226, %r9, %r225;
+ add.s32 %r227, %r226, -1;
+ div.s32 %r10, %r227, %r9;
+ setp.gt.s32 %p11, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p11 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r171;
- cvt.s64.s32 %rd50, %r4;
+ cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
- mov.u32 %r230, %ctaid.y;
- mul.lo.s32 %r231, %r9, %r3;
- mul.lo.s32 %r10, %r231, %r230;
- shl.b32 %r232, %r7, 2;
- shl.b32 %r233, %r5, 4;
- mad.lo.s32 %r11, %r232, %r171, %r233;
- mul.lo.s32 %r234, %r171, %r7;
- cvt.s64.s32 %rd54, %r234;
- cvt.s64.s32 %rd55, %r6;
+ mov.u32 %r229, %ctaid.y;
+ mul.lo.s32 %r230, %r10, %r4;
+ mul.lo.s32 %r11, %r230, %r229;
+ mad.lo.s32 %r231, %r2, %r8, %r6;
+ shl.b32 %r12, %r231, 4;
+ mul.lo.s32 %r232, %r171, %r8;
+ cvt.s64.s32 %rd54, %r232;
+ cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r235, %r10, %r171;
- cvt.s64.s32 %rd6, %r235;
- mul.lo.s32 %r12, %r171, %r3;
- mul.lo.s32 %r13, %r9, %r230;
- add.s32 %r14, %r234, %r6;
+ mul.lo.s32 %r233, %r11, %r171;
+ cvt.s64.s32 %rd6, %r233;
+ mul.lo.s32 %r13, %r171, %r4;
+ mul.lo.s32 %r14, %r10, %r229;
+ shl.b32 %r234, %r8, 2;
+ mad.lo.s32 %r235, %r234, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
- mul.wide.s32 %rd57, %r14, 4;
+ mul.wide.s32 %rd57, %r235, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r236, %tid.z;
- mad.lo.s32 %r237, %r3, %r236, %r7;
- mad.lo.s32 %r15, %r237, %r2, %r5;
+ mad.lo.s32 %r237, %r4, %r236, %r8;
+ mad.lo.s32 %r15, %r237, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
- clz.b32 %r238, %r2;
+ clz.b32 %r238, %r3;
mov.u32 %r239, 31;
sub.s32 %r240, %r239, %r238;
mov.u32 %r241, 1;
shl.b32 %r16, %r241, %r240;
- setp.lt.u32 %p12, %r5, %r16;
- add.s32 %r242, %r16, %r5;
- setp.lt.u32 %p13, %r242, %r2;
+ setp.lt.u32 %p12, %r6, %r16;
+ add.s32 %r242, %r16, %r6;
+ setp.lt.u32 %p13, %r242, %r3;
and.pred %p3, %p12, %p13;
add.s32 %r243, %r15, %r16;
mul.wide.s32 %rd59, %r243, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r244, %r16, 31;
@@ -162,31 +162,31 @@
add.s64 %rd10, %rd53, %rd57;
add.s32 %r246, %r15, 1;
mul.wide.u32 %rd60, %r246, 4;
add.s64 %rd11, %rd45, %rd60;
add.s64 %rd61, %rd45, %rd4;
- mul.wide.s32 %rd62, %r6, 4;
+ mul.wide.s32 %rd62, %r7, 4;
add.s64 %rd12, %rd61, %rd62;
mul.wide.s32 %rd63, %r237, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd37;
add.s64 %rd19, %rd46, %rd51;
- mov.u32 %r394, 0;
+ mov.u32 %r392, 0;
mov.f32 %f354, 0f00000000;
not.pred %p14, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r249, smem_ptr; }
- add.s32 %r250, %r11, %r249;
+ add.s32 %r250, %r249, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r275, smem_ptr; }
- add.s32 %r276, %r11, %r275;
+ add.s32 %r276, %r275, %r12;
not.pred %p24, %p3;
mov.f32 %f355, %f354;
mov.f32 %f356, %f354;
mov.f32 %f357, %f354;
mov.f32 %f366, %f354;
@@ -196,16 +196,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p14 bra $L__BB0_8;
- mad.lo.s32 %r247, %r394, %r3, %r7;
- add.s32 %r248, %r247, %r10;
+ mad.lo.s32 %r247, %r392, %r4, %r8;
+ add.s32 %r248, %r247, %r11;
setp.gt.s32 %p15, %r248, 215;
@%p15 bra $L__BB0_8;
- mul.lo.s32 %r252, %r12, %r394;
+ mul.lo.s32 %r252, %r13, %r392;
cvt.s64.s32 %rd67, %r252;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd34, %rd70;
@@ -224,53 +224,53 @@
cp.async.wait_all;
@%p14 bra $L__BB0_10;
- add.s32 %r253, %r13, %r394;
- mad.lo.s32 %r254, %r253, %r3, %r7;
+ add.s32 %r253, %r14, %r392;
+ mad.lo.s32 %r254, %r253, %r4, %r8;
setp.lt.s32 %p17, %r254, 216;
@%p17 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r395, %r396, %r397, %r398}, [%rd7];
+ ld.shared.v4.u32 {%r393, %r394, %r395, %r396}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r263, %r13, %r394;
- mad.lo.s32 %r264, %r263, %r3, %r7;
+ add.s32 %r263, %r14, %r392;
+ mad.lo.s32 %r264, %r263, %r4, %r8;
setp.gt.s32 %p18, %r264, 215;
- mov.u32 %r395, 0;
- mov.u32 %r396, %r395;
- mov.u32 %r397, %r395;
- mov.u32 %r398, %r395;
+ mov.u32 %r393, 0;
+ mov.u32 %r394, %r393;
+ mov.u32 %r395, %r393;
+ mov.u32 %r396, %r393;
@%p18 bra $L__BB0_15;
- ld.shared.v4.u32 {%r395, %r396, %r397, %r398}, [%rd7];
+ ld.shared.v4.u32 {%r393, %r394, %r395, %r396}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r395, 0;
- mov.u32 %r396, %r395;
- mov.u32 %r397, %r395;
- mov.u32 %r398, %r395;
+ mov.u32 %r393, 0;
+ mov.u32 %r394, %r393;
+ mov.u32 %r395, %r393;
+ mov.u32 %r396, %r393;
$L__BB0_15:
- add.s32 %r273, %r13, %r394;
- mad.lo.s32 %r33, %r273, %r3, %r7;
- mov.b32 %f112, %r398;
+ add.s32 %r273, %r14, %r392;
+ mad.lo.s32 %r33, %r273, %r4, %r8;
+ mov.b32 %f112, %r396;
add.f32 %f369, %f369, %f112;
- mov.b32 %f113, %r397;
+ mov.b32 %f113, %r395;
add.f32 %f368, %f368, %f113;
- mov.b32 %f114, %r396;
+ mov.b32 %f114, %r394;
add.f32 %f367, %f367, %f114;
- mov.b32 %f115, %r395;
+ mov.b32 %f115, %r393;
add.f32 %f366, %f366, %f115;
setp.gt.s32 %p19, %r33, 215;
mov.f32 %f352, 0f00000000;
@%p19 bra $L__BB0_17;
@@ -283,11 +283,11 @@
setp.lt.s32 %p20, %r33, 216;
and.pred %p4, %p1, %p20;
not.pred %p21, %p4;
@%p21 bra $L__BB0_19;
- mul.lo.s32 %r278, %r12, %r394;
+ mul.lo.s32 %r278, %r13, %r392;
cvt.s64.s32 %rd75, %r278;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd35, %rd78;
@@ -300,13 +300,13 @@
}
$L__BB0_19:
- add.s32 %r393, %r13, %r394;
- mad.lo.s32 %r392, %r393, %r3, %r7;
- setp.gt.s32 %p139, %r392, 215;
+ add.s32 %r391, %r14, %r392;
+ mad.lo.s32 %r390, %r391, %r4, %r8;
+ setp.gt.s32 %p139, %r390, 215;
mov.f32 %f358, 0f00000000;
mov.f32 %f353, %f358;
@%p139 bra $L__BB0_21;
mul.lo.s32 %r279, %r33, %r184;
@@ -363,37 +363,37 @@
$L__BB0_25:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_30;
- mov.u32 %r399, %r17;
+ mov.u32 %r397, %r17;
$L__BB0_27:
- setp.ge.u32 %p26, %r5, %r399;
+ setp.ge.u32 %p26, %r6, %r397;
@%p26 bra $L__BB0_29;
- add.s32 %r280, %r399, %r15;
+ add.s32 %r280, %r397, %r15;
mul.wide.s32 %rd81, %r280, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r399, 1;
- setp.gt.u32 %p27, %r399, 3;
- mov.u32 %r399, %r35;
+ shr.u32 %r35, %r397, 1;
+ setp.gt.u32 %p27, %r397, 3;
+ mov.u32 %r397, %r35;
@%p27 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p28, %r5, 0;
+ setp.ne.s32 %p28, %r6, 0;
mov.f32 %f360, 0f00000000;
@%p28 bra $L__BB0_33;
- setp.lt.u32 %p29, %r2, 2;
+ setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f360, %f168, 0f00000000;
@%p29 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
@@ -413,36 +413,36 @@
$L__BB0_35:
setp.lt.s32 %p136, %r16, 4;
bar.sync 0;
@%p136 bra $L__BB0_40;
- mov.u32 %r400, %r17;
+ mov.u32 %r398, %r17;
$L__BB0_37:
- setp.ge.u32 %p32, %r5, %r400;
+ setp.ge.u32 %p32, %r6, %r398;
@%p32 bra $L__BB0_39;
- add.s32 %r281, %r400, %r15;
+ add.s32 %r281, %r398, %r15;
mul.wide.s32 %rd84, %r281, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r400, 1;
- setp.gt.u32 %p33, %r400, 3;
- mov.u32 %r400, %r37;
+ shr.u32 %r37, %r398, 1;
+ setp.gt.u32 %p33, %r398, 3;
+ mov.u32 %r398, %r37;
@%p33 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f361, 0f00000000;
@%p28 bra $L__BB0_43;
- setp.lt.u32 %p35, %r2, 2;
+ setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f361, %f177, 0f00000000;
@%p35 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
@@ -506,21 +506,20 @@
sub.f32 %f232, %f228, %f37;
mul.f32 %f233, %f38, %f231;
sub.f32 %f234, %f232, %f233;
mul.f32 %f235, %f179, %f234;
mov.b32 %r285, %f235;
- mad.lo.s32 %r286, %r394, %r3, %r10;
- mad.lo.s32 %r287, %r286, %r171, %r14;
- mul.wide.s32 %rd88, %r287, 4;
+ mad.lo.s32 %r286, %r33, %r171, %r7;
+ mul.wide.s32 %rd88, %r286, 4;
add.s64 %rd87, %rd39, %rd88;
st.global.cs.v4.s32 [%rd87], {%r282,%r283,%r284,%r285};
$L__BB0_49:
- add.s32 %r394, %r394, 1;
- setp.lt.s32 %p39, %r394, %r9;
+ add.s32 %r392, %r392, 1;
+ setp.lt.s32 %p39, %r392, %r10;
@%p39 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f354, 0f00000000;
@@ -531,31 +530,31 @@
mov.f32 %f367, %f354;
mov.f32 %f368, %f354;
mov.f32 %f369, %f354;
$L__BB0_50:
- mov.u32 %r288, %tid.z;
- mad.lo.s32 %r289, %r3, %r288, %r7;
- mad.lo.s32 %r39, %r289, %r2, %r5;
+ mov.u32 %r287, %tid.z;
+ mad.lo.s32 %r288, %r4, %r287, %r8;
+ mad.lo.s32 %r39, %r288, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
- clz.b32 %r290, %r3;
- mov.u32 %r291, 31;
- sub.s32 %r292, %r291, %r290;
- mov.u32 %r293, 1;
- shl.b32 %r40, %r293, %r292;
- setp.lt.u32 %p40, %r7, %r40;
- add.s32 %r294, %r40, %r7;
- setp.lt.u32 %p41, %r294, %r3;
+ clz.b32 %r289, %r4;
+ mov.u32 %r290, 31;
+ sub.s32 %r291, %r290, %r289;
+ mov.u32 %r292, 1;
+ shl.b32 %r40, %r292, %r291;
+ setp.lt.u32 %p40, %r8, %r40;
+ add.s32 %r293, %r40, %r8;
+ setp.lt.u32 %p41, %r293, %r4;
and.pred %p5, %p40, %p41;
- shl.b32 %r295, %r2, %r292;
- add.s32 %r296, %r39, %r295;
- mul.wide.s32 %rd91, %r296, 4;
+ shl.b32 %r294, %r3, %r291;
+ add.s32 %r295, %r39, %r294;
+ mul.wide.s32 %rd91, %r295, 4;
add.s64 %rd24, %rd45, %rd91;
- shr.u32 %r297, %r40, 31;
- add.s32 %r298, %r40, %r297;
- shr.s32 %r415, %r298, 1;
+ shr.u32 %r296, %r40, 31;
+ add.s32 %r297, %r40, %r296;
+ shr.s32 %r413, %r297, 1;
st.shared.f32 [%rd23], %f354;
bar.sync 0;
not.pred %p42, %p5;
@%p42 bra $L__BB0_52;
@@ -567,49 +566,49 @@
$L__BB0_52:
setp.lt.s32 %p43, %r40, 4;
bar.sync 0;
@%p43 bra $L__BB0_57;
- mov.u32 %r401, %r415;
+ mov.u32 %r399, %r413;
$L__BB0_54:
- setp.ge.u32 %p44, %r7, %r401;
+ setp.ge.u32 %p44, %r8, %r399;
@%p44 bra $L__BB0_56;
- mad.lo.s32 %r299, %r401, %r2, %r39;
- mul.wide.s32 %rd92, %r299, 4;
+ mad.lo.s32 %r298, %r399, %r3, %r39;
+ mul.wide.s32 %rd92, %r298, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f239, [%rd23];
ld.shared.f32 %f240, [%rd94];
add.f32 %f241, %f240, %f239;
st.shared.f32 [%rd23], %f241;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r401, 1;
- setp.gt.u32 %p45, %r401, 3;
- mov.u32 %r401, %r43;
+ shr.u32 %r43, %r399, 1;
+ setp.gt.u32 %p45, %r399, 3;
+ mov.u32 %r399, %r43;
@%p45 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r301, %r39, %r2;
- mul.wide.u32 %rd95, %r301, 4;
+ add.s32 %r300, %r39, %r3;
+ mul.wide.u32 %rd95, %r300, 4;
add.s64 %rd25, %rd45, %rd95;
- setp.ne.s32 %p46, %r7, 0;
- mov.u32 %r402, 0;
+ setp.ne.s32 %p46, %r8, 0;
+ mov.u32 %r400, 0;
@%p46 bra $L__BB0_61;
- setp.lt.u32 %p47, %r3, 2;
+ setp.lt.u32 %p47, %r4, 2;
ld.shared.f32 %f242, [%rd23];
add.f32 %f370, %f242, 0f00000000;
@%p47 bra $L__BB0_60;
ld.shared.f32 %f243, [%rd25];
add.f32 %f370, %f370, %f243;
$L__BB0_60:
- mov.b32 %r402, %f370;
+ mov.b32 %r400, %f370;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f355;
bar.sync 0;
@@ -622,45 +621,45 @@
$L__BB0_63:
bar.sync 0;
@%p43 bra $L__BB0_68;
- mov.u32 %r403, %r415;
+ mov.u32 %r401, %r413;
$L__BB0_65:
- setp.ge.u32 %p50, %r7, %r403;
+ setp.ge.u32 %p50, %r8, %r401;
@%p50 bra $L__BB0_67;
- mad.lo.s32 %r302, %r403, %r2, %r39;
- mul.wide.s32 %rd97, %r302, 4;
+ mad.lo.s32 %r301, %r401, %r3, %r39;
+ mul.wide.s32 %rd97, %r301, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f247, [%rd23];
ld.shared.f32 %f248, [%rd99];
add.f32 %f249, %f248, %f247;
st.shared.f32 [%rd23], %f249;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r403, 1;
- setp.gt.u32 %p51, %r403, 3;
- mov.u32 %r403, %r47;
+ shr.u32 %r47, %r401, 1;
+ setp.gt.u32 %p51, %r401, 3;
+ mov.u32 %r401, %r47;
@%p51 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r404, 0;
+ mov.u32 %r402, 0;
@%p46 bra $L__BB0_72;
- setp.lt.u32 %p53, %r3, 2;
+ setp.lt.u32 %p53, %r4, 2;
ld.shared.f32 %f250, [%rd23];
add.f32 %f371, %f250, 0f00000000;
@%p53 bra $L__BB0_71;
ld.shared.f32 %f251, [%rd25];
add.f32 %f371, %f371, %f251;
$L__BB0_71:
- mov.b32 %r404, %f371;
+ mov.b32 %r402, %f371;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f356;
bar.sync 0;
@@ -673,45 +672,45 @@
$L__BB0_74:
bar.sync 0;
@%p43 bra $L__BB0_79;
- mov.u32 %r405, %r415;
+ mov.u32 %r403, %r413;
$L__BB0_76:
- setp.ge.u32 %p56, %r7, %r405;
+ setp.ge.u32 %p56, %r8, %r403;
@%p56 bra $L__BB0_78;
- mad.lo.s32 %r304, %r405, %r2, %r39;
- mul.wide.s32 %rd100, %r304, 4;
+ mad.lo.s32 %r303, %r403, %r3, %r39;
+ mul.wide.s32 %rd100, %r303, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f255, [%rd23];
ld.shared.f32 %f256, [%rd102];
add.f32 %f257, %f256, %f255;
st.shared.f32 [%rd23], %f257;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r405, 1;
- setp.gt.u32 %p57, %r405, 3;
- mov.u32 %r405, %r51;
+ shr.u32 %r51, %r403, 1;
+ setp.gt.u32 %p57, %r403, 3;
+ mov.u32 %r403, %r51;
@%p57 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r406, 0;
+ mov.u32 %r404, 0;
@%p46 bra $L__BB0_83;
- setp.lt.u32 %p59, %r3, 2;
+ setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f258, [%rd23];
add.f32 %f372, %f258, 0f00000000;
@%p59 bra $L__BB0_82;
ld.shared.f32 %f259, [%rd25];
add.f32 %f372, %f372, %f259;
$L__BB0_82:
- mov.b32 %r406, %f372;
+ mov.b32 %r404, %f372;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f357;
bar.sync 0;
@@ -724,45 +723,45 @@
$L__BB0_85:
bar.sync 0;
@%p43 bra $L__BB0_90;
- mov.u32 %r407, %r415;
+ mov.u32 %r405, %r413;
$L__BB0_87:
- setp.ge.u32 %p62, %r7, %r407;
+ setp.ge.u32 %p62, %r8, %r405;
@%p62 bra $L__BB0_89;
- mad.lo.s32 %r306, %r407, %r2, %r39;
- mul.wide.s32 %rd103, %r306, 4;
+ mad.lo.s32 %r305, %r405, %r3, %r39;
+ mul.wide.s32 %rd103, %r305, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f263, [%rd23];
ld.shared.f32 %f264, [%rd105];
add.f32 %f265, %f264, %f263;
st.shared.f32 [%rd23], %f265;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r407, 1;
- setp.gt.u32 %p63, %r407, 3;
- mov.u32 %r407, %r55;
+ shr.u32 %r55, %r405, 1;
+ setp.gt.u32 %p63, %r405, 3;
+ mov.u32 %r405, %r55;
@%p63 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r408, 0;
+ mov.u32 %r406, 0;
@%p46 bra $L__BB0_94;
- setp.lt.u32 %p65, %r3, 2;
+ setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f266, [%rd23];
add.f32 %f373, %f266, 0f00000000;
@%p65 bra $L__BB0_93;
ld.shared.f32 %f267, [%rd25];
add.f32 %f373, %f373, %f267;
$L__BB0_93:
- mov.b32 %r408, %f373;
+ mov.b32 %r406, %f373;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f366;
bar.sync 0;
@@ -775,45 +774,45 @@
$L__BB0_96:
bar.sync 0;
@%p43 bra $L__BB0_101;
- mov.u32 %r409, %r415;
+ mov.u32 %r407, %r413;
$L__BB0_98:
- setp.ge.u32 %p68, %r7, %r409;
+ setp.ge.u32 %p68, %r8, %r407;
@%p68 bra $L__BB0_100;
- mad.lo.s32 %r308, %r409, %r2, %r39;
- mul.wide.s32 %rd106, %r308, 4;
+ mad.lo.s32 %r307, %r407, %r3, %r39;
+ mul.wide.s32 %rd106, %r307, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f271, [%rd23];
ld.shared.f32 %f272, [%rd108];
add.f32 %f273, %f272, %f271;
st.shared.f32 [%rd23], %f273;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r409, 1;
- setp.gt.u32 %p69, %r409, 3;
- mov.u32 %r409, %r59;
+ shr.u32 %r59, %r407, 1;
+ setp.gt.u32 %p69, %r407, 3;
+ mov.u32 %r407, %r59;
@%p69 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r410, 0;
+ mov.u32 %r408, 0;
@%p46 bra $L__BB0_105;
- setp.lt.u32 %p71, %r3, 2;
+ setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f274, [%rd23];
add.f32 %f374, %f274, 0f00000000;
@%p71 bra $L__BB0_104;
ld.shared.f32 %f275, [%rd25];
add.f32 %f374, %f374, %f275;
$L__BB0_104:
- mov.b32 %r410, %f374;
+ mov.b32 %r408, %f374;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f367;
bar.sync 0;
@@ -826,45 +825,45 @@
$L__BB0_107:
bar.sync 0;
@%p43 bra $L__BB0_112;
- mov.u32 %r411, %r415;
+ mov.u32 %r409, %r413;
$L__BB0_109:
- setp.ge.u32 %p74, %r7, %r411;
+ setp.ge.u32 %p74, %r8, %r409;
@%p74 bra $L__BB0_111;
- mad.lo.s32 %r310, %r411, %r2, %r39;
- mul.wide.s32 %rd109, %r310, 4;
+ mad.lo.s32 %r309, %r409, %r3, %r39;
+ mul.wide.s32 %rd109, %r309, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f279, [%rd23];
ld.shared.f32 %f280, [%rd111];
add.f32 %f281, %f280, %f279;
st.shared.f32 [%rd23], %f281;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r411, 1;
- setp.gt.u32 %p75, %r411, 3;
- mov.u32 %r411, %r63;
+ shr.u32 %r63, %r409, 1;
+ setp.gt.u32 %p75, %r409, 3;
+ mov.u32 %r409, %r63;
@%p75 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r412, 0;
+ mov.u32 %r410, 0;
@%p46 bra $L__BB0_116;
- setp.lt.u32 %p77, %r3, 2;
+ setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f282, [%rd23];
add.f32 %f375, %f282, 0f00000000;
@%p77 bra $L__BB0_115;
ld.shared.f32 %f283, [%rd25];
add.f32 %f375, %f375, %f283;
$L__BB0_115:
- mov.b32 %r412, %f375;
+ mov.b32 %r410, %f375;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@@ -877,45 +876,45 @@
$L__BB0_118:
bar.sync 0;
@%p43 bra $L__BB0_123;
- mov.u32 %r413, %r415;
+ mov.u32 %r411, %r413;
$L__BB0_120:
- setp.ge.u32 %p80, %r7, %r413;
+ setp.ge.u32 %p80, %r8, %r411;
@%p80 bra $L__BB0_122;
- mad.lo.s32 %r312, %r413, %r2, %r39;
- mul.wide.s32 %rd112, %r312, 4;
+ mad.lo.s32 %r311, %r411, %r3, %r39;
+ mul.wide.s32 %rd112, %r311, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f287, [%rd23];
ld.shared.f32 %f288, [%rd114];
add.f32 %f289, %f288, %f287;
st.shared.f32 [%rd23], %f289;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r413, 1;
- setp.gt.u32 %p81, %r413, 3;
- mov.u32 %r413, %r67;
+ shr.u32 %r67, %r411, 1;
+ setp.gt.u32 %p81, %r411, 3;
+ mov.u32 %r411, %r67;
@%p81 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r414, 0;
+ mov.u32 %r412, 0;
@%p46 bra $L__BB0_127;
- setp.lt.u32 %p83, %r3, 2;
+ setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f290, [%rd23];
add.f32 %f376, %f290, 0f00000000;
@%p83 bra $L__BB0_126;
ld.shared.f32 %f291, [%rd25];
add.f32 %f376, %f376, %f291;
$L__BB0_126:
- mov.b32 %r414, %f376;
+ mov.b32 %r412, %f376;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@@ -929,197 +928,196 @@
$L__BB0_129:
bar.sync 0;
@%p43 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p86, %r7, %r415;
+ setp.ge.u32 %p86, %r8, %r413;
@%p86 bra $L__BB0_132;
- mad.lo.s32 %r314, %r415, %r2, %r39;
- mul.wide.s32 %rd115, %r314, 4;
+ mad.lo.s32 %r313, %r413, %r3, %r39;
+ mul.wide.s32 %rd115, %r313, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f295, [%rd23];
ld.shared.f32 %f296, [%rd117];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd23], %f297;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r415, 1;
- setp.gt.u32 %p87, %r415, 3;
- mov.u32 %r415, %r71;
+ shr.u32 %r71, %r413, 1;
+ setp.gt.u32 %p87, %r413, 3;
+ mov.u32 %r413, %r71;
@%p87 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r416, 0;
+ mov.u32 %r414, 0;
@%p46 bra $L__BB0_137;
- setp.lt.u32 %p89, %r3, 2;
+ setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f298, [%rd23];
add.f32 %f377, %f298, 0f00000000;
@%p89 bra $L__BB0_136;
ld.shared.f32 %f299, [%rd25];
add.f32 %f377, %f377, %f299;
$L__BB0_136:
- mov.b32 %r416, %f377;
+ mov.b32 %r414, %f377;
$L__BB0_137:
- setp.eq.s32 %p138, %r7, 0;
+ setp.eq.s32 %p138, %r8, 0;
and.pred %p137, %p138, %p1;
bar.sync 0;
@%p137 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r391, %r5, 2;
- mov.u32 %r324, %ctaid.y;
- mad.lo.s32 %r325, %r171, %r324, %r391;
- mul.wide.s32 %rd120, %r325, 4;
+ mov.u32 %r323, %ctaid.y;
+ mad.lo.s32 %r324, %r171, %r323, %r7;
+ mul.wide.s32 %rd120, %r324, 4;
add.s64 %rd118, %rd42, %rd120;
- st.volatile.global.v4.s32 [%rd118], {%r402,%r404,%r406,%r408};
+ st.volatile.global.v4.s32 [%rd118], {%r400,%r402,%r404,%r406};
add.s64 %rd119, %rd43, %rd120;
- st.volatile.global.v4.s32 [%rd119], {%r410,%r412,%r414,%r416};
+ st.volatile.global.v4.s32 [%rd119], {%r408,%r410,%r412,%r414};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r326, %r5, %r7;
- or.b32 %r328, %r326, %r288;
- setp.ne.s32 %p90, %r328, 0;
+ or.b32 %r325, %r6, %r8;
+ or.b32 %r327, %r325, %r287;
+ setp.ne.s32 %p90, %r327, 0;
@%p90 bra $L__BB0_143;
ld.param.u64 %rd156, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd121, %rd156;
- mov.u32 %r329, %ctaid.x;
- mov.u32 %r330, %ctaid.z;
- mov.u32 %r331, %nctaid.x;
- mad.lo.s32 %r332, %r330, %r331, %r329;
- mul.wide.s32 %rd122, %r332, 8;
+ mov.u32 %r328, %ctaid.x;
+ mov.u32 %r329, %ctaid.z;
+ mov.u32 %r330, %nctaid.x;
+ mad.lo.s32 %r331, %r329, %r330, %r328;
+ mul.wide.s32 %rd122, %r331, 8;
add.s64 %rd28, %rd121, %rd122;
- add.s32 %r333, %r8, -1;
- setp.eq.s32 %p91, %r74, %r333;
- cvt.s64.s32 %rd123, %r8;
+ add.s32 %r332, %r9, -1;
+ setp.eq.s32 %p91, %r74, %r332;
+ cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p91;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p92, %rd128, 0;
@%p92 bra $L__BB0_143;
- mov.u32 %r417, 8;
+ mov.u32 %r415, 8;
$L__BB0_142:
- nanosleep.u32 %r417;
-
- setp.lt.u32 %p93, %r417, 256;
- selp.u32 %r336, 1, 0, %p93;
- shl.b32 %r417, %r417, %r336;
+ nanosleep.u32 %r415;
+
+ setp.lt.u32 %p93, %r415, 256;
+ selp.u32 %r335, 1, 0, %p93;
+ shl.b32 %r415, %r415, %r335;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p94, %rd130, -1;
@%p94 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- mov.u32 %r338, 1;
- add.s32 %r339, %r171, 1;
- shr.u32 %r340, %r339, 31;
- add.s32 %r341, %r339, %r340;
- shr.s32 %r342, %r341, 1;
- add.s32 %r343, %r3, %r342;
- add.s32 %r344, %r343, -1;
- div.s32 %r345, %r344, %r3;
- add.s32 %r346, %r8, -1;
- add.s32 %r347, %r346, %r345;
- div.s32 %r77, %r347, %r8;
- add.s32 %r78, %r346, %r2;
- shl.b32 %r79, %r7, 1;
- shl.b32 %r348, %r3, 1;
- mad.lo.s32 %r82, %r348, %r74, %r79;
+ mov.u32 %r337, 1;
+ add.s32 %r338, %r171, 1;
+ shr.u32 %r339, %r338, 31;
+ add.s32 %r340, %r338, %r339;
+ shr.s32 %r341, %r340, 1;
+ add.s32 %r342, %r4, %r341;
+ add.s32 %r343, %r342, -1;
+ div.s32 %r344, %r343, %r4;
+ add.s32 %r345, %r9, -1;
+ add.s32 %r346, %r345, %r344;
+ div.s32 %r77, %r346, %r9;
+ add.s32 %r78, %r345, %r3;
+ shl.b32 %r79, %r8, 1;
+ shl.b32 %r347, %r4, 1;
+ mad.lo.s32 %r82, %r347, %r74, %r79;
or.b32 %r80, %r82, 1;
- mul.lo.s32 %r81, %r348, %r8;
- clz.b32 %r349, %r2;
- mov.u32 %r350, 31;
- sub.s32 %r351, %r350, %r349;
- shl.b32 %r83, %r338, %r351;
- setp.lt.u32 %p95, %r5, %r83;
- add.s32 %r352, %r83, %r5;
- setp.lt.u32 %p96, %r352, %r2;
+ mul.lo.s32 %r81, %r347, %r9;
+ clz.b32 %r348, %r3;
+ mov.u32 %r349, 31;
+ sub.s32 %r350, %r349, %r348;
+ shl.b32 %r83, %r337, %r350;
+ setp.lt.u32 %p95, %r6, %r83;
+ add.s32 %r351, %r83, %r6;
+ setp.lt.u32 %p96, %r351, %r3;
and.pred %p6, %p95, %p96;
- add.s32 %r353, %r39, %r83;
- mul.wide.s32 %rd131, %r353, 4;
+ add.s32 %r352, %r39, %r83;
+ mul.wide.s32 %rd131, %r352, 4;
add.s64 %rd30, %rd45, %rd131;
- shr.u32 %r354, %r83, 31;
- add.s32 %r355, %r83, %r354;
- shr.s32 %r84, %r355, 1;
- add.s32 %r356, %r39, 1;
- mul.wide.u32 %rd133, %r356, 4;
+ shr.u32 %r353, %r83, 31;
+ add.s32 %r354, %r83, %r353;
+ shr.s32 %r84, %r354, 1;
+ add.s32 %r355, %r39, 1;
+ mul.wide.u32 %rd133, %r355, 4;
add.s64 %rd31, %rd45, %rd133;
- mov.u32 %r418, 0;
+ mov.u32 %r416, 0;
not.pred %p122, %p6;
bra.uni $L__BB0_144;
$L__BB0_211:
- add.s32 %r418, %r418, 1;
+ add.s32 %r416, %r416, 1;
$L__BB0_144:
.pragma "nounroll";
- setp.lt.s32 %p97, %r418, %r77;
+ setp.lt.s32 %p97, %r416, %r77;
@%p97 bra $L__BB0_180;
bra.uni $L__BB0_145;
$L__BB0_180:
- div.s32 %r114, %r78, %r2;
+ div.s32 %r114, %r78, %r3;
setp.lt.s32 %p118, %r114, 1;
mov.f32 %f386, 0f00000000;
mov.f32 %f387, %f386;
@%p118 bra $L__BB0_186;
- mul.lo.s32 %r375, %r81, %r418;
- add.s32 %r115, %r80, %r375;
- add.s32 %r116, %r82, %r375;
- mov.u32 %r374, 0;
+ mul.lo.s32 %r374, %r81, %r416;
+ add.s32 %r115, %r80, %r374;
+ add.s32 %r116, %r82, %r374;
+ mov.u32 %r373, 0;
mov.f32 %f386, 0f00000000;
- mov.u32 %r429, %r374;
+ mov.u32 %r427, %r373;
$L__BB0_182:
.pragma "nounroll";
setp.ge.s32 %p119, %r115, %r171;
- mov.u32 %r430, %r374;
- mov.u32 %r431, %r374;
+ mov.u32 %r428, %r373;
+ mov.u32 %r429, %r373;
@%p119 bra $L__BB0_185;
- mad.lo.s32 %r118, %r429, %r2, %r5;
- setp.ge.s32 %p120, %r118, %r8;
- mov.u32 %r430, %r374;
- mov.u32 %r431, %r374;
+ mad.lo.s32 %r118, %r427, %r3, %r6;
+ setp.ge.s32 %p120, %r118, %r9;
+ mov.u32 %r428, %r373;
+ mov.u32 %r429, %r373;
@%p120 bra $L__BB0_185;
- mad.lo.s32 %r382, %r118, %r171, %r116;
- mul.wide.s32 %rd145, %r382, 4;
+ mad.lo.s32 %r381, %r118, %r171, %r116;
+ mul.wide.s32 %rd145, %r381, 4;
add.s64 %rd144, %rd43, %rd145;
- ld.volatile.global.v2.s32 {%r431,%r430}, [%rd144];
+ ld.volatile.global.v2.s32 {%r429,%r428}, [%rd144];
$L__BB0_185:
- mov.b32 %f326, %r431;
+ mov.b32 %f326, %r429;
add.f32 %f387, %f387, %f326;
- mov.b32 %f327, %r430;
+ mov.b32 %f327, %r428;
add.f32 %f386, %f386, %f327;
- add.s32 %r429, %r429, 1;
- setp.lt.s32 %p121, %r429, %r114;
+ add.s32 %r427, %r427, 1;
+ setp.lt.s32 %p121, %r427, %r114;
@%p121 bra $L__BB0_182;
$L__BB0_186:
st.shared.f32 [%rd23], %f387;
bar.sync 0;
@@ -1133,46 +1131,46 @@
$L__BB0_188:
setp.lt.s32 %p123, %r83, 4;
bar.sync 0;
@%p123 bra $L__BB0_193;
- mov.u32 %r432, %r84;
+ mov.u32 %r430, %r84;
$L__BB0_190:
- setp.ge.u32 %p124, %r5, %r432;
+ setp.ge.u32 %p124, %r6, %r430;
@%p124 bra $L__BB0_192;
- add.s32 %r383, %r432, %r39;
- mul.wide.s32 %rd146, %r383, 4;
+ add.s32 %r382, %r430, %r39;
+ mul.wide.s32 %rd146, %r382, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f331, [%rd23];
ld.shared.f32 %f332, [%rd148];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd23], %f333;
$L__BB0_192:
bar.sync 0;
- shr.u32 %r125, %r432, 1;
- setp.gt.u32 %p125, %r432, 3;
- mov.u32 %r432, %r125;
+ shr.u32 %r125, %r430, 1;
+ setp.gt.u32 %p125, %r430, 3;
+ mov.u32 %r430, %r125;
@%p125 bra $L__BB0_190;
$L__BB0_193:
- setp.ne.s32 %p126, %r5, 0;
- mov.u32 %r433, 0;
+ setp.ne.s32 %p126, %r6, 0;
+ mov.u32 %r431, 0;
@%p126 bra $L__BB0_197;
- setp.lt.u32 %p127, %r2, 2;
+ setp.lt.u32 %p127, %r3, 2;
ld.shared.f32 %f334, [%rd23];
add.f32 %f388, %f334, 0f00000000;
@%p127 bra $L__BB0_196;
ld.shared.f32 %f335, [%rd31];
add.f32 %f388, %f388, %f335;
$L__BB0_196:
- mov.b32 %r433, %f388;
+ mov.b32 %r431, %f388;
$L__BB0_197:
bar.sync 0;
st.shared.f32 [%rd23], %f386;
bar.sync 0;
@@ -1185,117 +1183,117 @@
$L__BB0_199:
bar.sync 0;
@%p123 bra $L__BB0_204;
- mov.u32 %r434, %r84;
+ mov.u32 %r432, %r84;
$L__BB0_201:
- setp.ge.u32 %p130, %r5, %r434;
+ setp.ge.u32 %p130, %r6, %r432;
@%p130 bra $L__BB0_203;
- add.s32 %r385, %r434, %r39;
- mul.wide.s32 %rd149, %r385, 4;
+ add.s32 %r384, %r432, %r39;
+ mul.wide.s32 %rd149, %r384, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f339, [%rd23];
ld.shared.f32 %f340, [%rd151];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd23], %f341;
$L__BB0_203:
bar.sync 0;
- shr.u32 %r129, %r434, 1;
- setp.gt.u32 %p131, %r434, 3;
- mov.u32 %r434, %r129;
+ shr.u32 %r129, %r432, 1;
+ setp.gt.u32 %p131, %r432, 3;
+ mov.u32 %r432, %r129;
@%p131 bra $L__BB0_201;
$L__BB0_204:
- mov.u32 %r435, 0;
+ mov.u32 %r433, 0;
@%p126 bra $L__BB0_208;
- setp.lt.u32 %p133, %r2, 2;
+ setp.lt.u32 %p133, %r3, 2;
ld.shared.f32 %f342, [%rd23];
add.f32 %f389, %f342, 0f00000000;
@%p133 bra $L__BB0_207;
ld.shared.f32 %f343, [%rd31];
add.f32 %f389, %f389, %f343;
$L__BB0_207:
- mov.b32 %r435, %f389;
+ mov.b32 %r433, %f389;
$L__BB0_208:
bar.sync 0;
@%p126 bra $L__BB0_211;
- mul.lo.s32 %r132, %r81, %r418;
- add.s32 %r387, %r80, %r132;
- setp.ge.s32 %p135, %r387, %r171;
+ mul.lo.s32 %r132, %r81, %r416;
+ add.s32 %r386, %r80, %r132;
+ setp.ge.s32 %p135, %r386, %r171;
@%p135 bra $L__BB0_211;
ld.param.u64 %rd155, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r390, %r82, %r132;
- mul.wide.s32 %rd153, %r390, 4;
+ add.s32 %r389, %r82, %r132;
+ mul.wide.s32 %rd153, %r389, 4;
add.s64 %rd152, %rd155, %rd153;
- st.global.cs.v2.s32 [%rd152], {%r433,%r435};
+ st.global.cs.v2.s32 [%rd152], {%r431,%r433};
bra.uni $L__BB0_211;
$L__BB0_145:
setp.lt.s32 %p98, %r77, 1;
@%p98 bra $L__BB0_179;
- div.s32 %r86, %r78, %r2;
- mad.lo.s32 %r87, %r171, %r5, %r79;
+ div.s32 %r86, %r78, %r3;
+ mad.lo.s32 %r87, %r171, %r6, %r79;
shl.b32 %r88, %r74, 1;
- shl.b32 %r89, %r8, 1;
- mul.lo.s32 %r90, %r171, %r2;
- mov.u32 %r419, 0;
+ shl.b32 %r89, %r9, 1;
+ mul.lo.s32 %r90, %r171, %r3;
+ mov.u32 %r417, 0;
$L__BB0_147:
.pragma "nounroll";
setp.lt.s32 %p99, %r86, 1;
mov.f32 %f380, 0f00000000;
mov.f32 %f381, %f380;
@%p99 bra $L__BB0_153;
- mad.lo.s32 %r92, %r81, %r419, %r80;
- mad.lo.s32 %r359, %r89, %r419, %r88;
- mad.lo.s32 %r421, %r3, %r359, %r87;
- mov.u32 %r358, 0;
+ mad.lo.s32 %r92, %r81, %r417, %r80;
+ mad.lo.s32 %r358, %r89, %r417, %r88;
+ mad.lo.s32 %r419, %r4, %r358, %r87;
+ mov.u32 %r357, 0;
mov.f32 %f380, 0f00000000;
- mov.u32 %r420, %r5;
- mov.u32 %r422, %r358;
+ mov.u32 %r418, %r6;
+ mov.u32 %r420, %r357;
$L__BB0_149:
.pragma "nounroll";
setp.ge.s32 %p100, %r92, %r171;
- mov.u32 %r423, %r358;
- mov.u32 %r424, %r358;
+ mov.u32 %r421, %r357;
+ mov.u32 %r422, %r357;
@%p100 bra $L__BB0_152;
- setp.ge.s32 %p101, %r420, %r8;
- mov.u32 %r423, %r358;
- mov.u32 %r424, %r358;
+ setp.ge.s32 %p101, %r418, %r9;
+ mov.u32 %r421, %r357;
+ mov.u32 %r422, %r357;
@%p101 bra $L__BB0_152;
- mul.wide.s32 %rd135, %r421, 4;
+ mul.wide.s32 %rd135, %r419, 4;
add.s64 %rd134, %rd42, %rd135;
- ld.volatile.global.v2.s32 {%r424,%r423}, [%rd134];
+ ld.volatile.global.v2.s32 {%r422,%r421}, [%rd134];
$L__BB0_152:
- mov.b32 %f304, %r424;
+ mov.b32 %f304, %r422;
add.f32 %f381, %f381, %f304;
- mov.b32 %f305, %r423;
+ mov.b32 %f305, %r421;
add.f32 %f380, %f380, %f305;
- add.s32 %r421, %r421, %r90;
- add.s32 %r420, %r420, %r2;
- add.s32 %r422, %r422, 1;
- setp.lt.s32 %p102, %r422, %r86;
+ add.s32 %r419, %r419, %r90;
+ add.s32 %r418, %r418, %r3;
+ add.s32 %r420, %r420, 1;
+ setp.lt.s32 %p102, %r420, %r86;
@%p102 bra $L__BB0_149;
$L__BB0_153:
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@@ -1309,46 +1307,46 @@
$L__BB0_155:
setp.lt.s32 %p104, %r83, 4;
bar.sync 0;
@%p104 bra $L__BB0_160;
- mov.u32 %r425, %r84;
+ mov.u32 %r423, %r84;
$L__BB0_157:
- setp.ge.u32 %p105, %r5, %r425;
+ setp.ge.u32 %p105, %r6, %r423;
@%p105 bra $L__BB0_159;
- add.s32 %r366, %r425, %r39;
- mul.wide.s32 %rd136, %r366, 4;
+ add.s32 %r365, %r423, %r39;
+ mul.wide.s32 %rd136, %r365, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f309, [%rd23];
ld.shared.f32 %f310, [%rd138];
add.f32 %f311, %f310, %f309;
st.shared.f32 [%rd23], %f311;
$L__BB0_159:
bar.sync 0;
- shr.u32 %r105, %r425, 1;
- setp.gt.u32 %p106, %r425, 3;
- mov.u32 %r425, %r105;
+ shr.u32 %r105, %r423, 1;
+ setp.gt.u32 %p106, %r423, 3;
+ mov.u32 %r423, %r105;
@%p106 bra $L__BB0_157;
$L__BB0_160:
- setp.ne.s32 %p107, %r5, 0;
- mov.u32 %r426, 0;
+ setp.ne.s32 %p107, %r6, 0;
+ mov.u32 %r424, 0;
@%p107 bra $L__BB0_164;
- setp.lt.u32 %p108, %r2, 2;
+ setp.lt.u32 %p108, %r3, 2;
ld.shared.f32 %f312, [%rd23];
add.f32 %f382, %f312, 0f00000000;
@%p108 bra $L__BB0_163;
ld.shared.f32 %f313, [%rd31];
add.f32 %f382, %f382, %f313;
$L__BB0_163:
- mov.b32 %r426, %f382;
+ mov.b32 %r424, %f382;
$L__BB0_164:
bar.sync 0;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
@@ -1361,66 +1359,66 @@
$L__BB0_166:
bar.sync 0;
@%p104 bra $L__BB0_171;
- mov.u32 %r427, %r84;
+ mov.u32 %r425, %r84;
$L__BB0_168:
- setp.ge.u32 %p111, %r5, %r427;
+ setp.ge.u32 %p111, %r6, %r425;
@%p111 bra $L__BB0_170;
- add.s32 %r368, %r427, %r39;
- mul.wide.s32 %rd139, %r368, 4;
+ add.s32 %r367, %r425, %r39;
+ mul.wide.s32 %rd139, %r367, 4;
add.s64 %rd141, %rd45, %rd139;
ld.shared.f32 %f317, [%rd23];
ld.shared.f32 %f318, [%rd141];
add.f32 %f319, %f318, %f317;
st.shared.f32 [%rd23], %f319;
$L__BB0_170:
bar.sync 0;
- shr.u32 %r109, %r427, 1;
- setp.gt.u32 %p112, %r427, 3;
- mov.u32 %r427, %r109;
+ shr.u32 %r109, %r425, 1;
+ setp.gt.u32 %p112, %r425, 3;
+ mov.u32 %r425, %r109;
@%p112 bra $L__BB0_168;
$L__BB0_171:
- mov.u32 %r428, 0;
+ mov.u32 %r426, 0;
@%p107 bra $L__BB0_175;
- setp.lt.u32 %p114, %r2, 2;
+ setp.lt.u32 %p114, %r3, 2;
ld.shared.f32 %f320, [%rd23];
add.f32 %f383, %f320, 0f00000000;
@%p114 bra $L__BB0_174;
ld.shared.f32 %f321, [%rd31];
add.f32 %f383, %f383, %f321;
$L__BB0_174:
- mov.b32 %r428, %f383;
+ mov.b32 %r426, %f383;
$L__BB0_175:
bar.sync 0;
@%p107 bra $L__BB0_178;
- mul.lo.s32 %r112, %r81, %r419;
- add.s32 %r370, %r80, %r112;
- setp.ge.s32 %p116, %r370, %r171;
+ mul.lo.s32 %r112, %r81, %r417;
+ add.s32 %r369, %r80, %r112;
+ setp.ge.s32 %p116, %r369, %r171;
@%p116 bra $L__BB0_178;
ld.param.u64 %rd154, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r373, %r82, %r112;
- mul.wide.s32 %rd143, %r373, 4;
+ add.s32 %r372, %r82, %r112;
+ mul.wide.s32 %rd143, %r372, 4;
add.s64 %rd142, %rd154, %rd143;
- st.global.cs.v2.s32 [%rd142], {%r426,%r428};
+ st.global.cs.v2.s32 [%rd142], {%r424,%r426};
$L__BB0_178:
- add.s32 %r419, %r419, 1;
- setp.lt.s32 %p117, %r419, %r77;
+ add.s32 %r417, %r417, 1;
+ setp.lt.s32 %p117, %r417, %r77;
@%p117 bra $L__BB0_147;
$L__BB0_179:
ret;
10: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_768
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 47→ 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
.reg .b32 %r<597>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r164, %r165}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r168, %r169}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
shr.s32 %r194, %r193, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r195, %r194, %r2;
add.s32 %r196, %r195, 31;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 27;
add.s32 %r199, %r196, %r198;
shr.u32 %r200, %r199, 5;
mov.u32 %r3, %ntid.y;
mul.lo.s32 %r201, %r3, %r200;
shl.b32 %r202, %r201, 7;
cvt.u64.u32 %rd1, %r202;
mul.lo.s32 %r203, %r3, %r194;
shl.b32 %r204, %r203, 4;
or.b32 %r205, %r204, 15;
and.b32 %r4, %r205, -16;
add.s32 %r206, %r205, %r4;
and.b32 %r207, %r206, -16;
cvt.s64.s32 %rd2, %r207;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p6, %r5, %r194;
shl.b32 %r6, %r5, 2;
or.b32 %r208, %r6, 3;
setp.lt.s32 %p7, %r208, %r155;
and.pred %p1, %p7, %p6;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p8, %r7, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r209, smem_ptr; }
// end inline asm
shl.b32 %r212, %r5, 4;
add.s32 %r210, %r209, %r212;
mul.wide.s32 %rd48, %r6, 4;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r211, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r211, 0;
cp.async.ca.shared.global [%r210], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r213, %r3, 215;
div.s32 %r214, %r213, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r215, %r8, %r214;
add.s32 %r216, %r215, -1;
div.s32 %r9, %r216, %r8;
setp.gt.s32 %p10, %r9, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
cvt.s64.s32 %rd49, %r4;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r218, %ctaid.y;
mul.lo.s32 %r219, %r9, %r3;
mul.lo.s32 %r10, %r219, %r218;
shl.b32 %r220, %r7, 2;
shl.b32 %r221, %r5, 4;
mad.lo.s32 %r11, %r220, %r155, %r221;
mul.lo.s32 %r222, %r155, %r7;
cvt.s64.s32 %rd53, %r222;
cvt.s64.s32 %rd54, %r6;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r223, %r10, %r155;
cvt.s64.s32 %rd6, %r223;
mul.lo.s32 %r12, %r155, %r3;
mul.lo.s32 %r13, %r9, %r218;
add.s32 %r14, %r222, %r6;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r14, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
mad.lo.s32 %r225, %r224, %r3, %r7;
shr.u32 %r15, %r2, 5;
mul.lo.s32 %r226, %r225, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r5, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r6, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
mov.u32 %r565, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
// end inline asm
add.s32 %r232, %r11, %r231;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
add.s32 %r258, %r11, %r257;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r229, %r565, %r3, %r7;
add.s32 %r230, %r229, %r10;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r234, %r12, %r565;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r233, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r233, 0;
cp.async.ca.shared.global [%r232], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r235, %r13, %r565;
mad.lo.s32 %r236, %r235, %r3, %r7;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r245, %r13, %r565;
mad.lo.s32 %r246, %r245, %r3, %r7;
setp.gt.s32 %p15, %r246, 215;
mov.u32 %r566, 0;
mov.u32 %r567, %r566;
mov.u32 %r568, %r566;
mov.u32 %r569, %r566;
@%p15 bra $L__BB0_15;
ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r566, 0;
mov.u32 %r567, %r566;
mov.u32 %r568, %r566;
mov.u32 %r569, %r566;
$L__BB0_15:
add.s32 %r255, %r13, %r565;
mad.lo.s32 %r33, %r255, %r3, %r7;
mov.b32 %f125, %r569;
add.f32 %f455, %f455, %f125;
mov.b32 %f126, %r568;
add.f32 %f454, %f454, %f126;
mov.b32 %f127, %r567;
add.f32 %f453, %f453, %f127;
mov.b32 %f128, %r566;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
mul.lo.s32 %r256, %r33, %r164;
mul.wide.s32 %rd69, %r256, 4;
add.s64 %rd70, %rd15, %rd69;
ld.global.f32 %f436, [%rd70];
$L__BB0_17:
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
mul.lo.s32 %r260, %r12, %r565;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
mov.f32 %f442, 0f00000000;
mov.f32 %f437, %f442;
@%p16 bra $L__BB0_21;
mul.lo.s32 %r261, %r33, %r168;
mul.wide.s32 %rd77, %r261, 4;
add.s64 %rd78, %rd16, %rd77;
ld.global.f32 %f437, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f443, %f442;
@%p18 bra $L__BB0_23;
ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%rd11];
ld.shared.v4.f32 {%f137, %f138, %f139, %f140}, [%rd7];
mul.f32 %f142, %f132, %f137;
add.f32 %f143, %f142, 0f00000000;
ld.shared.v4.f32 {%f144, %f145, %f146, %f147}, [%rd9];
sub.f32 %f149, %f144, %f436;
mul.f32 %f150, %f437, %f149;
fma.rn.f32 %f151, %f142, %f150, 0f00000000;
fma.rn.f32 %f438, %f150, %f137, %f438;
mul.f32 %f154, %f133, %f138;
add.f32 %f155, %f143, %f154;
sub.f32 %f157, %f145, %f436;
mul.f32 %f158, %f437, %f157;
fma.rn.f32 %f159, %f154, %f158, %f151;
fma.rn.f32 %f439, %f158, %f138, %f439;
mul.f32 %f162, %f134, %f139;
add.f32 %f163, %f155, %f162;
sub.f32 %f165, %f146, %f436;
mul.f32 %f166, %f437, %f165;
fma.rn.f32 %f167, %f162, %f166, %f159;
fma.rn.f32 %f440, %f166, %f139, %f440;
mul.f32 %f170, %f135, %f140;
add.f32 %f443, %f163, %f170;
sub.f32 %f172, %f147, %f436;
mul.f32 %f173, %f437, %f172;
fma.rn.f32 %f442, %f170, %f173, %f167;
fma.rn.f32 %f441, %f173, %f140, %f441;
$L__BB0_23:
mov.b32 %r262, %f443;
mov.u32 %r263, 31;
mov.u32 %r264, 16;
mov.u32 %r265, -1;
shfl.sync.bfly.b32 %r266|%p21, %r262, %r264, %r263, %r265;
mov.b32 %f174, %r266;
add.f32 %f175, %f443, %f174;
mov.b32 %r267, %f175;
mov.u32 %r268, 8;
shfl.sync.bfly.b32 %r269|%p22, %r267, %r268, %r263, %r265;
mov.b32 %f176, %r269;
add.f32 %f177, %f175, %f176;
mov.b32 %r270, %f177;
mov.u32 %r271, 4;
shfl.sync.bfly.b32 %r272|%p23, %r270, %r271, %r263, %r265;
mov.b32 %f178, %r272;
add.f32 %f179, %f177, %f178;
mov.b32 %r273, %f179;
mov.u32 %r274, 2;
shfl.sync.bfly.b32 %r275|%p24, %r273, %r274, %r263, %r265;
mov.b32 %f180, %r275;
add.f32 %f181, %f179, %f180;
mov.b32 %r276, %f181;
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r278|%p25, %r276, %r277, %r263, %r265;
mov.b32 %f182, %r278;
add.f32 %f445, %f181, %f182;
bar.sync 0;
setp.ne.s32 %p26, %r17, 0;
@%p26 bra $L__BB0_25;
st.shared.f32 [%rd8], %f445;
$L__BB0_25:
setp.ne.s32 %p27, %r16, 0;
bar.sync 0;
@%p27 bra $L__BB0_29;
setp.ge.u32 %p28, %r17, %r15;
mov.f32 %f444, 0f00000000;
@%p28 bra $L__BB0_28;
ld.shared.f32 %f444, [%rd10];
$L__BB0_28:
mov.b32 %r279, %f444;
mov.u32 %r280, 31;
mov.u32 %r281, 16;
mov.u32 %r282, -1;
shfl.sync.bfly.b32 %r283|%p29, %r279, %r281, %r280, %r282;
mov.b32 %f184, %r283;
add.f32 %f185, %f444, %f184;
mov.b32 %r284, %f185;
mov.u32 %r285, 8;
shfl.sync.bfly.b32 %r286|%p30, %r284, %r285, %r280, %r282;
mov.b32 %f186, %r286;
add.f32 %f187, %f185, %f186;
mov.b32 %r287, %f187;
mov.u32 %r288, 4;
shfl.sync.bfly.b32 %r289|%p31, %r287, %r288, %r280, %r282;
mov.b32 %f188, %r289;
add.f32 %f189, %f187, %f188;
mov.b32 %r290, %f189;
mov.u32 %r291, 2;
shfl.sync.bfly.b32 %r292|%p32, %r290, %r291, %r280, %r282;
mov.b32 %f190, %r292;
add.f32 %f191, %f189, %f190;
mov.b32 %r293, %f191;
mov.u32 %r294, 1;
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
setp.ne.s32 %p180, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
shfl.sync.bfly.b32 %r300|%p34, %r296, %r298, %r297, %r299;
mov.b32 %f193, %r300;
add.f32 %f194, %f442, %f193;
mov.b32 %r301, %f194;
mov.u32 %r302, 8;
shfl.sync.bfly.b32 %r303|%p35, %r301, %r302, %r297, %r299;
mov.b32 %f195, %r303;
add.f32 %f196, %f194, %f195;
mov.b32 %r304, %f196;
mov.u32 %r305, 4;
shfl.sync.bfly.b32 %r306|%p36, %r304, %r305, %r297, %r299;
mov.b32 %f197, %r306;
add.f32 %f198, %f196, %f197;
mov.b32 %r307, %f198;
mov.u32 %r308, 2;
shfl.sync.bfly.b32 %r309|%p37, %r307, %r308, %r297, %r299;
mov.b32 %f199, %r309;
add.f32 %f200, %f198, %f199;
mov.b32 %r310, %f200;
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
@%p180 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
setp.ne.s32 %p181, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
@%p181 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f446, [%rd10];
$L__BB0_34:
mov.b32 %r313, %f446;
mov.u32 %r314, 31;
mov.u32 %r315, 16;
mov.u32 %r316, -1;
shfl.sync.bfly.b32 %r317|%p42, %r313, %r315, %r314, %r316;
mov.b32 %f204, %r317;
add.f32 %f205, %f446, %f204;
mov.b32 %r318, %f205;
mov.u32 %r319, 8;
shfl.sync.bfly.b32 %r320|%p43, %r318, %r319, %r314, %r316;
mov.b32 %f206, %r320;
add.f32 %f207, %f205, %f206;
mov.b32 %r321, %f207;
mov.u32 %r322, 4;
shfl.sync.bfly.b32 %r323|%p44, %r321, %r322, %r314, %r316;
mov.b32 %f208, %r323;
add.f32 %f209, %f207, %f208;
mov.b32 %r324, %f209;
mov.u32 %r325, 2;
shfl.sync.bfly.b32 %r326|%p45, %r324, %r325, %r314, %r316;
mov.b32 %f210, %r326;
add.f32 %f211, %f209, %f210;
mov.b32 %r327, %f211;
mov.u32 %r328, 1;
shfl.sync.bfly.b32 %r329|%p46, %r327, %r328, %r314, %r316;
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
setp.ne.s32 %p47, %r5, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
bar.sync 0;
ld.shared.f32 %f42, [%rd12];
bar.sync 0;
@%p47 bra $L__BB0_39;
setp.eq.s32 %p182, %r17, 0;
add.f32 %f213, %f447, 0f00000000;
selp.f32 %f214, %f213, 0f00000000, %p182;
st.shared.f32 [%rd12], %f214;
$L__BB0_39:
bar.sync 0;
ld.shared.f32 %f43, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_41;
mul.f32 %f215, %f437, %f1;
ld.shared.v4.f32 {%f216, %f217, %f218, %f219}, [%rd11];
ld.shared.v4.f32 {%f221, %f222, %f223, %f224}, [%rd7];
mul.f32 %f226, %f216, %f221;
mul.f32 %f227, %f226, %f2;
ld.shared.v4.f32 {%f228, %f229, %f230, %f231}, [%rd9];
sub.f32 %f233, %f228, %f436;
mul.f32 %f234, %f437, %f233;
sub.f32 %f235, %f227, %f42;
mul.f32 %f236, %f43, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f215, %f237;
mov.b32 %r330, %f238;
mul.f32 %f241, %f217, %f222;
mul.f32 %f242, %f241, %f2;
sub.f32 %f244, %f229, %f436;
mul.f32 %f245, %f437, %f244;
sub.f32 %f246, %f242, %f42;
mul.f32 %f247, %f43, %f245;
sub.f32 %f248, %f246, %f247;
mul.f32 %f249, %f215, %f248;
mov.b32 %r331, %f249;
mul.f32 %f252, %f218, %f223;
mul.f32 %f253, %f252, %f2;
sub.f32 %f255, %f230, %f436;
mul.f32 %f256, %f437, %f255;
sub.f32 %f257, %f253, %f42;
mul.f32 %f258, %f43, %f256;
sub.f32 %f259, %f257, %f258;
mul.f32 %f260, %f215, %f259;
mov.b32 %r332, %f260;
mul.f32 %f263, %f219, %f224;
mul.f32 %f264, %f263, %f2;
sub.f32 %f266, %f231, %f436;
mul.f32 %f267, %f437, %f266;
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
mad.lo.s32 %r334, %r565, %r3, %r10;
mad.lo.s32 %r335, %r334, %r155, %r14;
mul.wide.s32 %rd80, %r335, 4;
add.s64 %rd79, %rd38, %rd80;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_41:
add.s32 %r565, %r565, 1;
setp.lt.s32 %p51, %r565, %r9;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
mov.u32 %r336, %tid.z;
mad.lo.s32 %r35, %r336, %r3, %r7;
mad.lo.s32 %r36, %r35, %r2, %r5;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
clz.b32 %r337, %r3;
mov.u32 %r338, 31;
sub.s32 %r339, %r338, %r337;
mov.u32 %r340, 1;
shl.b32 %r37, %r340, %r339;
setp.lt.u32 %p52, %r7, %r37;
add.s32 %r341, %r37, %r7;
setp.lt.u32 %p53, %r341, %r3;
and.pred %p5, %p52, %p53;
shl.b32 %r342, %r2, %r339;
add.s32 %r343, %r36, %r342;
mul.wide.s32 %rd83, %r343, 4;
add.s64 %rd23, %rd44, %rd83;
shr.u32 %r344, %r37, 31;
add.s32 %r345, %r37, %r344;
shr.s32 %r584, %r345, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
ld.shared.f32 %f272, [%rd23];
ld.shared.f32 %f273, [%rd22];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd22], %f274;
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
mov.u32 %r570, %r584;
$L__BB0_46:
setp.ge.u32 %p56, %r7, %r570;
@%p56 bra $L__BB0_48;
mad.lo.s32 %r346, %r570, %r2, %r36;
mul.wide.s32 %rd84, %r346, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
shr.u32 %r40, %r570, 1;
setp.gt.u32 %p57, %r570, 3;
mov.u32 %r570, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
add.s32 %r348, %r36, %r2;
mul.wide.u32 %rd87, %r348, 4;
add.s64 %rd24, %rd44, %rd87;
setp.ne.s32 %p58, %r7, 0;
mov.u32 %r571, 0;
@%p58 bra $L__BB0_53;
setp.lt.u32 %p59, %r3, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
mov.b32 %r571, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@%p54 bra $L__BB0_55;
ld.shared.f32 %f280, [%rd23];
ld.shared.f32 %f281, [%rd22];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd22], %f282;
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
mov.u32 %r572, %r584;
$L__BB0_57:
setp.ge.u32 %p62, %r7, %r572;
@%p62 bra $L__BB0_59;
mad.lo.s32 %r349, %r572, %r2, %r36;
mul.wide.s32 %rd89, %r349, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
shr.u32 %r44, %r572, 1;
setp.gt.u32 %p63, %r572, 3;
mov.u32 %r572, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
mov.u32 %r573, 0;
@%p58 bra $L__BB0_64;
setp.lt.u32 %p65, %r3, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
mov.b32 %r573, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@%p54 bra $L__BB0_66;
ld.shared.f32 %f288, [%rd23];
ld.shared.f32 %f289, [%rd22];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd22], %f290;
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
mov.u32 %r574, %r584;
$L__BB0_68:
setp.ge.u32 %p68, %r7, %r574;
@%p68 bra $L__BB0_70;
mad.lo.s32 %r351, %r574, %r2, %r36;
mul.wide.s32 %rd92, %r351, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
shr.u32 %r48, %r574, 1;
setp.gt.u32 %p69, %r574, 3;
mov.u32 %r574, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
mov.u32 %r575, 0;
@%p58 bra $L__BB0_75;
setp.lt.u32 %p71, %r3, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
mov.b32 %r575, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@%p54 bra $L__BB0_77;
ld.shared.f32 %f296, [%rd23];
ld.shared.f32 %f297, [%rd22];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd22], %f298;
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
mov.u32 %r576, %r584;
$L__BB0_79:
setp.ge.u32 %p74, %r7, %r576;
@%p74 bra $L__BB0_81;
mad.lo.s32 %r353, %r576, %r2, %r36;
mul.wide.s32 %rd95, %r353, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
shr.u32 %r52, %r576, 1;
setp.gt.u32 %p75, %r576, 3;
mov.u32 %r576, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
mov.u32 %r577, 0;
@%p58 bra $L__BB0_86;
setp.lt.u32 %p77, %r3, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
mov.b32 %r577, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@%p54 bra $L__BB0_88;
ld.shared.f32 %f304, [%rd23];
ld.shared.f32 %f305, [%rd22];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd22], %f306;
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
mov.u32 %r578, %r584;
$L__BB0_90:
setp.ge.u32 %p80, %r7, %r578;
@%p80 bra $L__BB0_92;
mad.lo.s32 %r355, %r578, %r2, %r36;
mul.wide.s32 %rd98, %r355, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
shr.u32 %r56, %r578, 1;
setp.gt.u32 %p81, %r578, 3;
mov.u32 %r578, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
mov.u32 %r579, 0;
@%p58 bra $L__BB0_97;
setp.lt.u32 %p83, %r3, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
mov.b32 %r579, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@%p54 bra $L__BB0_99;
ld.shared.f32 %f312, [%rd23];
ld.shared.f32 %f313, [%rd22];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd22], %f314;
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
mov.u32 %r580, %r584;
$L__BB0_101:
setp.ge.u32 %p86, %r7, %r580;
@%p86 bra $L__BB0_103;
mad.lo.s32 %r357, %r580, %r2, %r36;
mul.wide.s32 %rd101, %r357, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
shr.u32 %r60, %r580, 1;
setp.gt.u32 %p87, %r580, 3;
mov.u32 %r580, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
mov.u32 %r581, 0;
@%p58 bra $L__BB0_108;
setp.lt.u32 %p89, %r3, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
mov.b32 %r581, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@%p54 bra $L__BB0_110;
ld.shared.f32 %f320, [%rd23];
ld.shared.f32 %f321, [%rd22];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd22], %f322;
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
mov.u32 %r582, %r584;
$L__BB0_112:
setp.ge.u32 %p92, %r7, %r582;
@%p92 bra $L__BB0_114;
mad.lo.s32 %r359, %r582, %r2, %r36;
mul.wide.s32 %rd104, %r359, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
shr.u32 %r64, %r582, 1;
setp.gt.u32 %p93, %r582, 3;
mov.u32 %r582, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
mov.u32 %r583, 0;
@%p58 bra $L__BB0_119;
setp.lt.u32 %p95, %r3, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
mov.b32 %r583, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@%p54 bra $L__BB0_121;
ld.shared.f32 %f328, [%rd23];
ld.shared.f32 %f329, [%rd22];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd22], %f330;
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
setp.ge.u32 %p98, %r7, %r584;
@%p98 bra $L__BB0_124;
mad.lo.s32 %r361, %r584, %r2, %r36;
mul.wide.s32 %rd107, %r361, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
shr.u32 %r68, %r584, 1;
setp.gt.u32 %p99, %r584, 3;
mov.u32 %r584, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r585, 0;
@%p58 bra $L__BB0_129;
setp.lt.u32 %p101, %r3, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
mov.b32 %r585, %f463;
$L__BB0_129:
setp.eq.s32 %p179, %r7, 0;
and.pred %p178, %p179, %p1;
bar.sync 0;
@%p178 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
shl.b32 %r564, %r5, 2;
mov.u32 %r371, %ctaid.y;
mad.lo.s32 %r372, %r155, %r371, %r564;
mul.wide.s32 %rd112, %r372, 4;
add.s64 %rd110, %rd41, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd110], {%r571,%r573,%r575,%r577};
// end inline asm
add.s64 %rd111, %rd42, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd111], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r373, %r5, %r7;
or.b32 %r375, %r373, %r336;
setp.ne.s32 %p102, %r375, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
mov.u32 %r376, %ctaid.x;
mov.u32 %r377, %ctaid.z;
mov.u32 %r378, %nctaid.x;
mad.lo.s32 %r379, %r377, %r378, %r376;
mul.wide.s32 %rd114, %r379, 8;
add.s64 %rd27, %rd113, %rd114;
add.s32 %r380, %r8, -1;
setp.eq.s32 %p103, %r71, %r380;
cvt.s64.s32 %rd115, %r8;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
mov.u32 %r586, 8;
$L__BB0_134:
// begin inline asm
nanosleep.u32 %r586;
// end inline asm
setp.lt.u32 %p105, %r586, 256;
selp.u32 %r383, 1, 0, %p105;
shl.b32 %r586, %r586, %r383;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
add.s32 %r385, %r155, 1;
shr.u32 %r386, %r385, 31;
add.s32 %r387, %r385, %r386;
shr.s32 %r388, %r387, 1;
add.s32 %r389, %r3, %r388;
add.s32 %r390, %r389, -1;
div.s32 %r391, %r390, %r3;
add.s32 %r392, %r8, -1;
add.s32 %r393, %r392, %r391;
div.s32 %r74, %r393, %r8;
add.s32 %r75, %r392, %r2;
shl.b32 %r76, %r7, 1;
shl.b32 %r394, %r3, 1;
mad.lo.s32 %r79, %r394, %r71, %r76;
or.b32 %r77, %r79, 1;
mul.lo.s32 %r78, %r394, %r8;
shr.u32 %r80, %r2, 5;
mul.lo.s32 %r395, %r35, %r80;
shr.u32 %r81, %r5, 5;
add.s32 %r396, %r395, %r81;
mul.wide.u32 %rd123, %r396, 4;
add.s64 %rd29, %rd44, %rd123;
and.b32 %r82, %r5, 31;
add.s32 %r397, %r395, %r82;
mul.wide.u32 %rd125, %r397, 4;
add.s64 %rd30, %rd44, %rd125;
mov.u32 %r587, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
add.s32 %r587, %r587, 1;
$L__BB0_136:
.pragma "nounroll";
setp.lt.s32 %p107, %r587, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
div.s32 %r105, %r75, %r2;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
mul.lo.s32 %r482, %r78, %r587;
add.s32 %r106, %r77, %r482;
add.s32 %r107, %r79, %r482;
mov.u32 %r481, 0;
mov.f32 %f474, 0f00000000;
mov.u32 %r594, %r481;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
mov.u32 %r595, %r481;
mov.u32 %r596, %r481;
@%p145 bra $L__BB0_167;
mad.lo.s32 %r109, %r594, %r2, %r5;
setp.ge.s32 %p146, %r109, %r8;
mov.u32 %r595, %r481;
mov.u32 %r596, %r481;
@%p146 bra $L__BB0_167;
mad.lo.s32 %r489, %r109, %r155, %r107;
mul.wide.s32 %rd131, %r489, 4;
add.s64 %rd130, %rd42, %rd131;
// begin inline asm
ld.volatile.global.v2.s32 {%r596,%r595}, [%rd130];
// end inline asm
$L__BB0_167:
mov.b32 %f386, %r596;
add.f32 %f475, %f475, %f386;
mov.b32 %f387, %r595;
add.f32 %f474, %f474, %f387;
add.s32 %r594, %r594, 1;
setp.lt.s32 %p147, %r594, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
mov.b32 %r490, %f475;
mov.u32 %r491, 31;
mov.u32 %r492, 16;
mov.u32 %r493, -1;
shfl.sync.bfly.b32 %r494|%p148, %r490, %r492, %r491, %r493;
mov.b32 %f388, %r494;
add.f32 %f389, %f475, %f388;
mov.b32 %r495, %f389;
mov.u32 %r496, 8;
shfl.sync.bfly.b32 %r497|%p149, %r495, %r496, %r491, %r493;
mov.b32 %f390, %r497;
add.f32 %f391, %f389, %f390;
mov.b32 %r498, %f391;
mov.u32 %r499, 4;
shfl.sync.bfly.b32 %r500|%p150, %r498, %r499, %r491, %r493;
mov.b32 %f392, %r500;
add.f32 %f393, %f391, %f392;
mov.b32 %r501, %f393;
mov.u32 %r502, 2;
shfl.sync.bfly.b32 %r503|%p151, %r501, %r502, %r491, %r493;
mov.b32 %f394, %r503;
add.f32 %f395, %f393, %f394;
mov.b32 %r504, %f395;
mov.u32 %r505, 1;
shfl.sync.bfly.b32 %r506|%p152, %r504, %r505, %r491, %r493;
mov.b32 %f396, %r506;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
st.shared.f32 [%rd29], %f477;
$L__BB0_170:
setp.ne.s32 %p154, %r81, 0;
bar.sync 0;
@%p154 bra $L__BB0_174;
setp.ge.u32 %p155, %r82, %r80;
mov.f32 %f476, 0f00000000;
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
mov.b32 %r507, %f476;
mov.u32 %r508, 31;
mov.u32 %r509, 16;
mov.u32 %r510, -1;
shfl.sync.bfly.b32 %r511|%p156, %r507, %r509, %r508, %r510;
mov.b32 %f398, %r511;
add.f32 %f399, %f476, %f398;
mov.b32 %r512, %f399;
mov.u32 %r513, 8;
shfl.sync.bfly.b32 %r514|%p157, %r512, %r513, %r508, %r510;
mov.b32 %f400, %r514;
add.f32 %f401, %f399, %f400;
mov.b32 %r515, %f401;
mov.u32 %r516, 4;
shfl.sync.bfly.b32 %r517|%p158, %r515, %r516, %r508, %r510;
mov.b32 %f402, %r517;
add.f32 %f403, %f401, %f402;
mov.b32 %r518, %f403;
mov.u32 %r519, 2;
shfl.sync.bfly.b32 %r520|%p159, %r518, %r519, %r508, %r510;
mov.b32 %f404, %r520;
add.f32 %f405, %f403, %f404;
mov.b32 %r521, %f405;
mov.u32 %r522, 1;
shfl.sync.bfly.b32 %r523|%p160, %r521, %r522, %r508, %r510;
mov.b32 %f406, %r523;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
mov.b32 %r524, %f407;
setp.eq.s32 %p162, %r82, 0;
selp.b32 %r115, %r524, 0, %p162;
bar.sync 0;
mov.b32 %r525, %f474;
mov.u32 %r526, 31;
mov.u32 %r527, 16;
mov.u32 %r528, -1;
shfl.sync.bfly.b32 %r529|%p163, %r525, %r527, %r526, %r528;
mov.b32 %f408, %r529;
add.f32 %f409, %f474, %f408;
mov.b32 %r530, %f409;
mov.u32 %r531, 8;
shfl.sync.bfly.b32 %r532|%p164, %r530, %r531, %r526, %r528;
mov.b32 %f410, %r532;
add.f32 %f411, %f409, %f410;
mov.b32 %r533, %f411;
mov.u32 %r534, 4;
shfl.sync.bfly.b32 %r535|%p165, %r533, %r534, %r526, %r528;
mov.b32 %f412, %r535;
add.f32 %f413, %f411, %f412;
mov.b32 %r536, %f413;
mov.u32 %r537, 2;
shfl.sync.bfly.b32 %r538|%p166, %r536, %r537, %r526, %r528;
mov.b32 %f414, %r538;
add.f32 %f415, %f413, %f414;
mov.b32 %r539, %f415;
mov.u32 %r540, 1;
shfl.sync.bfly.b32 %r541|%p167, %r539, %r540, %r526, %r528;
mov.b32 %f416, %r541;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
$L__BB0_176:
bar.sync 0;
@%p154 bra $L__BB0_180;
setp.ge.u32 %p169, %r82, %r80;
mov.f32 %f478, 0f00000000;
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
mov.b32 %r542, %f478;
mov.u32 %r543, 31;
mov.u32 %r544, 16;
mov.u32 %r545, -1;
shfl.sync.bfly.b32 %r546|%p170, %r542, %r544, %r543, %r545;
mov.b32 %f418, %r546;
add.f32 %f419, %f478, %f418;
mov.b32 %r547, %f419;
mov.u32 %r548, 8;
shfl.sync.bfly.b32 %r549|%p171, %r547, %r548, %r543, %r545;
mov.b32 %f420, %r549;
add.f32 %f421, %f419, %f420;
mov.b32 %r550, %f421;
mov.u32 %r551, 4;
shfl.sync.bfly.b32 %r552|%p172, %r550, %r551, %r543, %r545;
mov.b32 %f422, %r552;
add.f32 %f423, %f421, %f422;
mov.b32 %r553, %f423;
mov.u32 %r554, 2;
shfl.sync.bfly.b32 %r555|%p173, %r553, %r554, %r543, %r545;
mov.b32 %f424, %r555;
add.f32 %f425, %f423, %f424;
mov.b32 %r556, %f425;
mov.u32 %r557, 1;
shfl.sync.bfly.b32 %r558|%p174, %r556, %r557, %r543, %r545;
mov.b32 %f426, %r558;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
setp.ne.s32 %p175, %r5, 0;
@%p175 bra $L__BB0_183;
mul.lo.s32 %r116, %r78, %r587;
add.s32 %r559, %r77, %r116;
setp.ge.s32 %p176, %r559, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r562, %r79, %r116;
mul.wide.s32 %rd133, %r562, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
mov.b32 %r563, %f427;
selp.b32 %r561, %r563, 0, %p162;
// begin inline asm
st.global.cs.v2.s32 [%rd132], {%r115,%r561};
// end inline asm
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
div.s32 %r84, %r75, %r2;
mad.lo.s32 %r85, %r155, %r5, %r76;
shl.b32 %r86, %r71, 1;
shl.b32 %r87, %r8, 1;
mul.lo.s32 %r88, %r155, %r2;
mov.u32 %r588, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
mad.lo.s32 %r90, %r78, %r588, %r77;
mad.lo.s32 %r400, %r87, %r588, %r86;
mad.lo.s32 %r590, %r3, %r400, %r85;
mov.u32 %r399, 0;
mov.f32 %f466, 0f00000000;
mov.u32 %r589, %r5;
mov.u32 %r591, %r399;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
mov.u32 %r592, %r399;
mov.u32 %r593, %r399;
@%p110 bra $L__BB0_144;
setp.ge.s32 %p111, %r589, %r8;
mov.u32 %r592, %r399;
mov.u32 %r593, %r399;
@%p111 bra $L__BB0_144;
mul.wide.s32 %rd127, %r590, 4;
add.s64 %rd126, %rd41, %rd127;
// begin inline asm
ld.volatile.global.v2.s32 {%r593,%r592}, [%rd126];
// end inline asm
$L__BB0_144:
mov.b32 %f340, %r593;
add.f32 %f467, %f467, %f340;
mov.b32 %f341, %r592;
add.f32 %f466, %f466, %f341;
add.s32 %r590, %r590, %r88;
add.s32 %r589, %r589, %r2;
add.s32 %r591, %r591, 1;
setp.lt.s32 %p112, %r591, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
mov.b32 %r407, %f467;
mov.u32 %r408, 31;
mov.u32 %r409, 16;
mov.u32 %r410, -1;
shfl.sync.bfly.b32 %r411|%p113, %r407, %r409, %r408, %r410;
mov.b32 %f342, %r411;
add.f32 %f343, %f467, %f342;
mov.b32 %r412, %f343;
mov.u32 %r413, 8;
shfl.sync.bfly.b32 %r414|%p114, %r412, %r413, %r408, %r410;
mov.b32 %f344, %r414;
add.f32 %f345, %f343, %f344;
mov.b32 %r415, %f345;
mov.u32 %r416, 4;
shfl.sync.bfly.b32 %r417|%p115, %r415, %r416, %r408, %r410;
mov.b32 %f346, %r417;
add.f32 %f347, %f345, %f346;
mov.b32 %r418, %f347;
mov.u32 %r419, 2;
shfl.sync.bfly.b32 %r420|%p116, %r418, %r419, %r408, %r410;
mov.b32 %f348, %r420;
add.f32 %f349, %f347, %f348;
mov.b32 %r421, %f349;
mov.u32 %r422, 1;
shfl.sync.bfly.b32 %r423|%p117, %r421, %r422, %r408, %r410;
mov.b32 %f350, %r423;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
st.shared.f32 [%rd29], %f469;
$L__BB0_147:
setp.ne.s32 %p119, %r81, 0;
bar.sync 0;
@%p119 bra $L__BB0_151;
setp.ge.u32 %p120, %r82, %r80;
mov.f32 %f468, 0f00000000;
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
mov.b32 %r424, %f468;
mov.u32 %r425, 31;
mov.u32 %r426, 16;
mov.u32 %r427, -1;
shfl.sync.bfly.b32 %r428|%p121, %r424, %r426, %r425, %r427;
mov.b32 %f352, %r428;
add.f32 %f353, %f468, %f352;
mov.b32 %r429, %f353;
mov.u32 %r430, 8;
shfl.sync.bfly.b32 %r431|%p122, %r429, %r430, %r425, %r427;
mov.b32 %f354, %r431;
add.f32 %f355, %f353, %f354;
mov.b32 %r432, %f355;
mov.u32 %r433, 4;
shfl.sync.bfly.b32 %r434|%p123, %r432, %r433, %r425, %r427;
mov.b32 %f356, %r434;
add.f32 %f357, %f355, %f356;
mov.b32 %r435, %f357;
mov.u32 %r436, 2;
shfl.sync.bfly.b32 %r437|%p124, %r435, %r436, %r425, %r427;
mov.b32 %f358, %r437;
add.f32 %f359, %f357, %f358;
mov.b32 %r438, %f359;
mov.u32 %r439, 1;
shfl.sync.bfly.b32 %r440|%p125, %r438, %r439, %r425, %r427;
mov.b32 %f360, %r440;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
mov.b32 %r441, %f361;
setp.eq.s32 %p127, %r82, 0;
selp.b32 %r102, %r441, 0, %p127;
bar.sync 0;
mov.b32 %r442, %f466;
mov.u32 %r443, 31;
mov.u32 %r444, 16;
mov.u32 %r445, -1;
shfl.sync.bfly.b32 %r446|%p128, %r442, %r444, %r443, %r445;
mov.b32 %f362, %r446;
add.f32 %f363, %f466, %f362;
mov.b32 %r447, %f363;
mov.u32 %r448, 8;
shfl.sync.bfly.b32 %r449|%p129, %r447, %r448, %r443, %r445;
mov.b32 %f364, %r449;
add.f32 %f365, %f363, %f364;
mov.b32 %r450, %f365;
mov.u32 %r451, 4;
shfl.sync.bfly.b32 %r452|%p130, %r450, %r451, %r443, %r445;
mov.b32 %f366, %r452;
add.f32 %f367, %f365, %f366;
mov.b32 %r453, %f367;
mov.u32 %r454, 2;
shfl.sync.bfly.b32 %r455|%p131, %r453, %r454, %r443, %r445;
mov.b32 %f368, %r455;
add.f32 %f369, %f367, %f368;
mov.b32 %r456, %f369;
mov.u32 %r457, 1;
shfl.sync.bfly.b32 %r458|%p132, %r456, %r457, %r443, %r445;
mov.b32 %f370, %r458;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
$L__BB0_153:
bar.sync 0;
@%p119 bra $L__BB0_157;
setp.ge.u32 %p134, %r82, %r80;
mov.f32 %f470, 0f00000000;
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
mov.b32 %r459, %f470;
mov.u32 %r460, 31;
mov.u32 %r461, 16;
mov.u32 %r462, -1;
shfl.sync.bfly.b32 %r463|%p135, %r459, %r461, %r460, %r462;
mov.b32 %f372, %r463;
add.f32 %f373, %f470, %f372;
mov.b32 %r464, %f373;
mov.u32 %r465, 8;
shfl.sync.bfly.b32 %r466|%p136, %r464, %r465, %r460, %r462;
mov.b32 %f374, %r466;
add.f32 %f375, %f373, %f374;
mov.b32 %r467, %f375;
mov.u32 %r468, 4;
shfl.sync.bfly.b32 %r469|%p137, %r467, %r468, %r460, %r462;
mov.b32 %f376, %r469;
add.f32 %f377, %f375, %f376;
mov.b32 %r470, %f377;
mov.u32 %r471, 2;
shfl.sync.bfly.b32 %r472|%p138, %r470, %r471, %r460, %r462;
mov.b32 %f378, %r472;
add.f32 %f379, %f377, %f378;
mov.b32 %r473, %f379;
mov.u32 %r474, 1;
shfl.sync.bfly.b32 %r475|%p139, %r473, %r474, %r460, %r462;
mov.b32 %f380, %r475;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
setp.ne.s32 %p140, %r5, 0;
@%p140 bra $L__BB0_160;
mul.lo.s32 %r103, %r78, %r588;
add.s32 %r476, %r77, %r103;
setp.ge.s32 %p141, %r476, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_31c3625e_1033910nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r479, %r79, %r103;
mul.wide.s32 %rd129, %r479, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
mov.b32 %r480, %f381;
selp.b32 %r478, %r480, 0, %p127;
// begin inline asm
st.global.cs.v2.s32 [%rd128], {%r102,%r478};
// end inline asm
$L__BB0_160:
add.s32 %r588, %r588, 1;
setp.lt.s32 %p143, %r588, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
.reg .b32 %r<595>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r164, %r165}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r168, %r169}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
shr.s32 %r2, %r193, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r194, %r2, %r3;
add.s32 %r195, %r194, 31;
shr.s32 %r196, %r195, 31;
shr.u32 %r197, %r196, 27;
add.s32 %r198, %r195, %r197;
shr.u32 %r199, %r198, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r200, %r4, %r199;
shl.b32 %r201, %r200, 7;
cvt.u64.u32 %rd1, %r201;
mul.lo.s32 %r202, %r4, %r2;
shl.b32 %r203, %r202, 4;
or.b32 %r204, %r203, 15;
and.b32 %r5, %r204, -16;
add.s32 %r205, %r204, %r5;
and.b32 %r206, %r205, -16;
cvt.s64.s32 %rd2, %r206;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r207, %r7, 3;
setp.lt.s32 %p7, %r207, %r155;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r208, smem_ptr; }
// end inline asm
shl.b32 %r211, %r6, 4;
add.s32 %r209, %r208, %r211;
mul.wide.s32 %rd48, %r7, 4;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r210, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r210, 0;
cp.async.ca.shared.global [%r209], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r212, %r4, 215;
div.s32 %r213, %r212, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r214, %r9, %r213;
add.s32 %r215, %r214, -1;
div.s32 %r10, %r215, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r217, %ctaid.y;
mul.lo.s32 %r218, %r10, %r4;
mul.lo.s32 %r11, %r218, %r217;
mad.lo.s32 %r219, %r2, %r8, %r6;
shl.b32 %r12, %r219, 4;
mul.lo.s32 %r220, %r155, %r8;
cvt.s64.s32 %rd53, %r220;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r221, %r11, %r155;
cvt.s64.s32 %rd6, %r221;
mul.lo.s32 %r13, %r155, %r4;
mul.lo.s32 %r14, %r10, %r217;
shl.b32 %r222, %r8, 2;
mad.lo.s32 %r223, %r222, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r223, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
mad.lo.s32 %r225, %r224, %r4, %r8;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r226, %r225, %r15;
shr.u32 %r16, %r6, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r6, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
mov.u32 %r563, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
// end inline asm
add.s32 %r232, %r231, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
add.s32 %r258, %r257, %r12;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r229, %r563, %r4, %r8;
add.s32 %r230, %r229, %r11;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r234, %r13, %r563;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r233, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r233, 0;
cp.async.ca.shared.global [%r232], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r235, %r14, %r563;
mad.lo.s32 %r236, %r235, %r4, %r8;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r245, %r14, %r563;
mad.lo.s32 %r246, %r245, %r4, %r8;
setp.gt.s32 %p15, %r246, 215;
mov.u32 %r564, 0;
mov.u32 %r565, %r564;
mov.u32 %r566, %r564;
mov.u32 %r567, %r564;
@%p15 bra $L__BB0_15;
ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r564, 0;
mov.u32 %r565, %r564;
mov.u32 %r566, %r564;
mov.u32 %r567, %r564;
$L__BB0_15:
add.s32 %r255, %r14, %r563;
mad.lo.s32 %r33, %r255, %r4, %r8;
mov.b32 %f125, %r567;
add.f32 %f455, %f455, %f125;
mov.b32 %f126, %r566;
add.f32 %f454, %f454, %f126;
mov.b32 %f127, %r565;
add.f32 %f453, %f453, %f127;
mov.b32 %f128, %r564;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
mul.lo.s32 %r256, %r33, %r164;
mul.wide.s32 %rd69, %r256, 4;
add.s64 %rd70, %rd15, %rd69;
ld.global.f32 %f436, [%rd70];
$L__BB0_17:
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
mul.lo.s32 %r260, %r13, %r563;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
mov.f32 %f442, 0f00000000;
mov.f32 %f437, %f442;
@%p16 bra $L__BB0_21;
mul.lo.s32 %r261, %r33, %r168;
mul.wide.s32 %rd77, %r261, 4;
add.s64 %rd78, %rd16, %rd77;
ld.global.f32 %f437, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f443, %f442;
@%p18 bra $L__BB0_23;
ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%rd11];
ld.shared.v4.f32 {%f137, %f138, %f139, %f140}, [%rd7];
mul.f32 %f142, %f132, %f137;
add.f32 %f143, %f142, 0f00000000;
ld.shared.v4.f32 {%f144, %f145, %f146, %f147}, [%rd9];
sub.f32 %f149, %f144, %f436;
mul.f32 %f150, %f437, %f149;
fma.rn.f32 %f151, %f142, %f150, 0f00000000;
fma.rn.f32 %f438, %f150, %f137, %f438;
mul.f32 %f154, %f133, %f138;
add.f32 %f155, %f143, %f154;
sub.f32 %f157, %f145, %f436;
mul.f32 %f158, %f437, %f157;
fma.rn.f32 %f159, %f154, %f158, %f151;
fma.rn.f32 %f439, %f158, %f138, %f439;
mul.f32 %f162, %f134, %f139;
add.f32 %f163, %f155, %f162;
sub.f32 %f165, %f146, %f436;
mul.f32 %f166, %f437, %f165;
fma.rn.f32 %f167, %f162, %f166, %f159;
fma.rn.f32 %f440, %f166, %f139, %f440;
mul.f32 %f170, %f135, %f140;
add.f32 %f443, %f163, %f170;
sub.f32 %f172, %f147, %f436;
mul.f32 %f173, %f437, %f172;
fma.rn.f32 %f442, %f170, %f173, %f167;
fma.rn.f32 %f441, %f173, %f140, %f441;
$L__BB0_23:
mov.b32 %r262, %f443;
mov.u32 %r263, 31;
mov.u32 %r264, 16;
mov.u32 %r265, -1;
shfl.sync.bfly.b32 %r266|%p21, %r262, %r264, %r263, %r265;
mov.b32 %f174, %r266;
add.f32 %f175, %f443, %f174;
mov.b32 %r267, %f175;
mov.u32 %r268, 8;
shfl.sync.bfly.b32 %r269|%p22, %r267, %r268, %r263, %r265;
mov.b32 %f176, %r269;
add.f32 %f177, %f175, %f176;
mov.b32 %r270, %f177;
mov.u32 %r271, 4;
shfl.sync.bfly.b32 %r272|%p23, %r270, %r271, %r263, %r265;
mov.b32 %f178, %r272;
add.f32 %f179, %f177, %f178;
mov.b32 %r273, %f179;
mov.u32 %r274, 2;
shfl.sync.bfly.b32 %r275|%p24, %r273, %r274, %r263, %r265;
mov.b32 %f180, %r275;
add.f32 %f181, %f179, %f180;
mov.b32 %r276, %f181;
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r278|%p25, %r276, %r277, %r263, %r265;
mov.b32 %f182, %r278;
add.f32 %f445, %f181, %f182;
bar.sync 0;
setp.ne.s32 %p26, %r17, 0;
@%p26 bra $L__BB0_25;
st.shared.f32 [%rd8], %f445;
$L__BB0_25:
setp.ne.s32 %p27, %r16, 0;
bar.sync 0;
@%p27 bra $L__BB0_29;
setp.ge.u32 %p28, %r17, %r15;
mov.f32 %f444, 0f00000000;
@%p28 bra $L__BB0_28;
ld.shared.f32 %f444, [%rd10];
$L__BB0_28:
mov.b32 %r279, %f444;
mov.u32 %r280, 31;
mov.u32 %r281, 16;
mov.u32 %r282, -1;
shfl.sync.bfly.b32 %r283|%p29, %r279, %r281, %r280, %r282;
mov.b32 %f184, %r283;
add.f32 %f185, %f444, %f184;
mov.b32 %r284, %f185;
mov.u32 %r285, 8;
shfl.sync.bfly.b32 %r286|%p30, %r284, %r285, %r280, %r282;
mov.b32 %f186, %r286;
add.f32 %f187, %f185, %f186;
mov.b32 %r287, %f187;
mov.u32 %r288, 4;
shfl.sync.bfly.b32 %r289|%p31, %r287, %r288, %r280, %r282;
mov.b32 %f188, %r289;
add.f32 %f189, %f187, %f188;
mov.b32 %r290, %f189;
mov.u32 %r291, 2;
shfl.sync.bfly.b32 %r292|%p32, %r290, %r291, %r280, %r282;
mov.b32 %f190, %r292;
add.f32 %f191, %f189, %f190;
mov.b32 %r293, %f191;
mov.u32 %r294, 1;
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
setp.ne.s32 %p181, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
shfl.sync.bfly.b32 %r300|%p34, %r296, %r298, %r297, %r299;
mov.b32 %f193, %r300;
add.f32 %f194, %f442, %f193;
mov.b32 %r301, %f194;
mov.u32 %r302, 8;
shfl.sync.bfly.b32 %r303|%p35, %r301, %r302, %r297, %r299;
mov.b32 %f195, %r303;
add.f32 %f196, %f194, %f195;
mov.b32 %r304, %f196;
mov.u32 %r305, 4;
shfl.sync.bfly.b32 %r306|%p36, %r304, %r305, %r297, %r299;
mov.b32 %f197, %r306;
add.f32 %f198, %f196, %f197;
mov.b32 %r307, %f198;
mov.u32 %r308, 2;
shfl.sync.bfly.b32 %r309|%p37, %r307, %r308, %r297, %r299;
mov.b32 %f199, %r309;
add.f32 %f200, %f198, %f199;
mov.b32 %r310, %f200;
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
@%p181 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
setp.ne.s32 %p178, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
@%p178 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f446, [%rd10];
$L__BB0_34:
mov.b32 %r313, %f446;
mov.u32 %r314, 31;
mov.u32 %r315, 16;
mov.u32 %r316, -1;
shfl.sync.bfly.b32 %r317|%p42, %r313, %r315, %r314, %r316;
mov.b32 %f204, %r317;
add.f32 %f205, %f446, %f204;
mov.b32 %r318, %f205;
mov.u32 %r319, 8;
shfl.sync.bfly.b32 %r320|%p43, %r318, %r319, %r314, %r316;
mov.b32 %f206, %r320;
add.f32 %f207, %f205, %f206;
mov.b32 %r321, %f207;
mov.u32 %r322, 4;
shfl.sync.bfly.b32 %r323|%p44, %r321, %r322, %r314, %r316;
mov.b32 %f208, %r323;
add.f32 %f209, %f207, %f208;
mov.b32 %r324, %f209;
mov.u32 %r325, 2;
shfl.sync.bfly.b32 %r326|%p45, %r324, %r325, %r314, %r316;
mov.b32 %f210, %r326;
add.f32 %f211, %f209, %f210;
mov.b32 %r327, %f211;
mov.u32 %r328, 1;
shfl.sync.bfly.b32 %r329|%p46, %r327, %r328, %r314, %r316;
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
setp.ne.s32 %p47, %r6, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
bar.sync 0;
ld.shared.f32 %f42, [%rd12];
bar.sync 0;
@%p47 bra $L__BB0_39;
setp.eq.s32 %p182, %r17, 0;
add.f32 %f213, %f447, 0f00000000;
selp.f32 %f214, %f213, 0f00000000, %p182;
st.shared.f32 [%rd12], %f214;
$L__BB0_39:
bar.sync 0;
ld.shared.f32 %f43, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_41;
mul.f32 %f215, %f437, %f1;
ld.shared.v4.f32 {%f216, %f217, %f218, %f219}, [%rd11];
ld.shared.v4.f32 {%f221, %f222, %f223, %f224}, [%rd7];
mul.f32 %f226, %f216, %f221;
mul.f32 %f227, %f226, %f2;
ld.shared.v4.f32 {%f228, %f229, %f230, %f231}, [%rd9];
sub.f32 %f233, %f228, %f436;
mul.f32 %f234, %f437, %f233;
sub.f32 %f235, %f227, %f42;
mul.f32 %f236, %f43, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f215, %f237;
mov.b32 %r330, %f238;
mul.f32 %f241, %f217, %f222;
mul.f32 %f242, %f241, %f2;
sub.f32 %f244, %f229, %f436;
mul.f32 %f245, %f437, %f244;
sub.f32 %f246, %f242, %f42;
mul.f32 %f247, %f43, %f245;
sub.f32 %f248, %f246, %f247;
mul.f32 %f249, %f215, %f248;
mov.b32 %r331, %f249;
mul.f32 %f252, %f218, %f223;
mul.f32 %f253, %f252, %f2;
sub.f32 %f255, %f230, %f436;
mul.f32 %f256, %f437, %f255;
sub.f32 %f257, %f253, %f42;
mul.f32 %f258, %f43, %f256;
sub.f32 %f259, %f257, %f258;
mul.f32 %f260, %f215, %f259;
mov.b32 %r332, %f260;
mul.f32 %f263, %f219, %f224;
mul.f32 %f264, %f263, %f2;
sub.f32 %f266, %f231, %f436;
mul.f32 %f267, %f437, %f266;
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
mad.lo.s32 %r334, %r33, %r155, %r7;
mul.wide.s32 %rd80, %r334, 4;
add.s64 %rd79, %rd38, %rd80;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_41:
add.s32 %r563, %r563, 1;
setp.lt.s32 %p51, %r563, %r10;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
mov.u32 %r335, %tid.z;
mad.lo.s32 %r35, %r335, %r4, %r8;
mad.lo.s32 %r36, %r35, %r3, %r6;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
clz.b32 %r336, %r4;
mov.u32 %r337, 31;
sub.s32 %r338, %r337, %r336;
mov.u32 %r339, 1;
shl.b32 %r37, %r339, %r338;
setp.lt.u32 %p52, %r8, %r37;
add.s32 %r340, %r37, %r8;
setp.lt.u32 %p53, %r340, %r4;
and.pred %p5, %p52, %p53;
shl.b32 %r341, %r3, %r338;
add.s32 %r342, %r36, %r341;
mul.wide.s32 %rd83, %r342, 4;
add.s64 %rd23, %rd44, %rd83;
shr.u32 %r343, %r37, 31;
add.s32 %r344, %r37, %r343;
shr.s32 %r582, %r344, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
ld.shared.f32 %f272, [%rd23];
ld.shared.f32 %f273, [%rd22];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd22], %f274;
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
mov.u32 %r568, %r582;
$L__BB0_46:
setp.ge.u32 %p56, %r8, %r568;
@%p56 bra $L__BB0_48;
mad.lo.s32 %r345, %r568, %r3, %r36;
mul.wide.s32 %rd84, %r345, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
shr.u32 %r40, %r568, 1;
setp.gt.u32 %p57, %r568, 3;
mov.u32 %r568, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
add.s32 %r347, %r36, %r3;
mul.wide.u32 %rd87, %r347, 4;
add.s64 %rd24, %rd44, %rd87;
setp.ne.s32 %p58, %r8, 0;
mov.u32 %r569, 0;
@%p58 bra $L__BB0_53;
setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
mov.b32 %r569, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@%p54 bra $L__BB0_55;
ld.shared.f32 %f280, [%rd23];
ld.shared.f32 %f281, [%rd22];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd22], %f282;
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
mov.u32 %r570, %r582;
$L__BB0_57:
setp.ge.u32 %p62, %r8, %r570;
@%p62 bra $L__BB0_59;
mad.lo.s32 %r348, %r570, %r3, %r36;
mul.wide.s32 %rd89, %r348, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
shr.u32 %r44, %r570, 1;
setp.gt.u32 %p63, %r570, 3;
mov.u32 %r570, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
mov.u32 %r571, 0;
@%p58 bra $L__BB0_64;
setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
mov.b32 %r571, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@%p54 bra $L__BB0_66;
ld.shared.f32 %f288, [%rd23];
ld.shared.f32 %f289, [%rd22];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd22], %f290;
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
mov.u32 %r572, %r582;
$L__BB0_68:
setp.ge.u32 %p68, %r8, %r572;
@%p68 bra $L__BB0_70;
mad.lo.s32 %r350, %r572, %r3, %r36;
mul.wide.s32 %rd92, %r350, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
shr.u32 %r48, %r572, 1;
setp.gt.u32 %p69, %r572, 3;
mov.u32 %r572, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
mov.u32 %r573, 0;
@%p58 bra $L__BB0_75;
setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
mov.b32 %r573, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@%p54 bra $L__BB0_77;
ld.shared.f32 %f296, [%rd23];
ld.shared.f32 %f297, [%rd22];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd22], %f298;
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
mov.u32 %r574, %r582;
$L__BB0_79:
setp.ge.u32 %p74, %r8, %r574;
@%p74 bra $L__BB0_81;
mad.lo.s32 %r352, %r574, %r3, %r36;
mul.wide.s32 %rd95, %r352, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
shr.u32 %r52, %r574, 1;
setp.gt.u32 %p75, %r574, 3;
mov.u32 %r574, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
mov.u32 %r575, 0;
@%p58 bra $L__BB0_86;
setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
mov.b32 %r575, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@%p54 bra $L__BB0_88;
ld.shared.f32 %f304, [%rd23];
ld.shared.f32 %f305, [%rd22];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd22], %f306;
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
mov.u32 %r576, %r582;
$L__BB0_90:
setp.ge.u32 %p80, %r8, %r576;
@%p80 bra $L__BB0_92;
mad.lo.s32 %r354, %r576, %r3, %r36;
mul.wide.s32 %rd98, %r354, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
shr.u32 %r56, %r576, 1;
setp.gt.u32 %p81, %r576, 3;
mov.u32 %r576, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
mov.u32 %r577, 0;
@%p58 bra $L__BB0_97;
setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
mov.b32 %r577, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@%p54 bra $L__BB0_99;
ld.shared.f32 %f312, [%rd23];
ld.shared.f32 %f313, [%rd22];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd22], %f314;
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
mov.u32 %r578, %r582;
$L__BB0_101:
setp.ge.u32 %p86, %r8, %r578;
@%p86 bra $L__BB0_103;
mad.lo.s32 %r356, %r578, %r3, %r36;
mul.wide.s32 %rd101, %r356, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
shr.u32 %r60, %r578, 1;
setp.gt.u32 %p87, %r578, 3;
mov.u32 %r578, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
mov.u32 %r579, 0;
@%p58 bra $L__BB0_108;
setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
mov.b32 %r579, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@%p54 bra $L__BB0_110;
ld.shared.f32 %f320, [%rd23];
ld.shared.f32 %f321, [%rd22];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd22], %f322;
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
mov.u32 %r580, %r582;
$L__BB0_112:
setp.ge.u32 %p92, %r8, %r580;
@%p92 bra $L__BB0_114;
mad.lo.s32 %r358, %r580, %r3, %r36;
mul.wide.s32 %rd104, %r358, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
shr.u32 %r64, %r580, 1;
setp.gt.u32 %p93, %r580, 3;
mov.u32 %r580, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
mov.u32 %r581, 0;
@%p58 bra $L__BB0_119;
setp.lt.u32 %p95, %r4, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
mov.b32 %r581, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@%p54 bra $L__BB0_121;
ld.shared.f32 %f328, [%rd23];
ld.shared.f32 %f329, [%rd22];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd22], %f330;
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
setp.ge.u32 %p98, %r8, %r582;
@%p98 bra $L__BB0_124;
mad.lo.s32 %r360, %r582, %r3, %r36;
mul.wide.s32 %rd107, %r360, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
shr.u32 %r68, %r582, 1;
setp.gt.u32 %p99, %r582, 3;
mov.u32 %r582, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r583, 0;
@%p58 bra $L__BB0_129;
setp.lt.u32 %p101, %r4, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
mov.b32 %r583, %f463;
$L__BB0_129:
setp.eq.s32 %p180, %r8, 0;
and.pred %p179, %p180, %p1;
bar.sync 0;
@%p179 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
mov.u32 %r370, %ctaid.y;
mad.lo.s32 %r371, %r155, %r370, %r7;
mul.wide.s32 %rd112, %r371, 4;
add.s64 %rd110, %rd41, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd110], {%r569,%r571,%r573,%r575};
// end inline asm
add.s64 %rd111, %rd42, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd111], {%r577,%r579,%r581,%r583};
// end inline asm
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r372, %r6, %r8;
or.b32 %r374, %r372, %r335;
setp.ne.s32 %p102, %r374, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
mov.u32 %r375, %ctaid.x;
mov.u32 %r376, %ctaid.z;
mov.u32 %r377, %nctaid.x;
mad.lo.s32 %r378, %r376, %r377, %r375;
mul.wide.s32 %rd114, %r378, 8;
add.s64 %rd27, %rd113, %rd114;
add.s32 %r379, %r9, -1;
setp.eq.s32 %p103, %r71, %r379;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
mov.u32 %r584, 8;
$L__BB0_134:
// begin inline asm
nanosleep.u32 %r584;
// end inline asm
setp.lt.u32 %p105, %r584, 256;
selp.u32 %r382, 1, 0, %p105;
shl.b32 %r584, %r584, %r382;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
add.s32 %r384, %r155, 1;
shr.u32 %r385, %r384, 31;
add.s32 %r386, %r384, %r385;
shr.s32 %r387, %r386, 1;
add.s32 %r388, %r4, %r387;
add.s32 %r389, %r388, -1;
div.s32 %r390, %r389, %r4;
add.s32 %r391, %r9, -1;
add.s32 %r392, %r391, %r390;
div.s32 %r74, %r392, %r9;
add.s32 %r75, %r391, %r3;
shl.b32 %r76, %r8, 1;
shl.b32 %r393, %r4, 1;
mad.lo.s32 %r79, %r393, %r71, %r76;
or.b32 %r77, %r79, 1;
mul.lo.s32 %r78, %r393, %r9;
shr.u32 %r80, %r3, 5;
mul.lo.s32 %r394, %r35, %r80;
shr.u32 %r81, %r6, 5;
add.s32 %r395, %r394, %r81;
mul.wide.u32 %rd123, %r395, 4;
add.s64 %rd29, %rd44, %rd123;
and.b32 %r82, %r6, 31;
add.s32 %r396, %r394, %r82;
mul.wide.u32 %rd125, %r396, 4;
add.s64 %rd30, %rd44, %rd125;
mov.u32 %r585, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
add.s32 %r585, %r585, 1;
$L__BB0_136:
.pragma "nounroll";
setp.lt.s32 %p107, %r585, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
div.s32 %r105, %r75, %r3;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
mul.lo.s32 %r481, %r78, %r585;
add.s32 %r106, %r77, %r481;
add.s32 %r107, %r79, %r481;
mov.u32 %r480, 0;
mov.f32 %f474, 0f00000000;
mov.u32 %r592, %r480;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
mov.u32 %r593, %r480;
mov.u32 %r594, %r480;
@%p145 bra $L__BB0_167;
mad.lo.s32 %r109, %r592, %r3, %r6;
setp.ge.s32 %p146, %r109, %r9;
mov.u32 %r593, %r480;
mov.u32 %r594, %r480;
@%p146 bra $L__BB0_167;
mad.lo.s32 %r488, %r109, %r155, %r107;
mul.wide.s32 %rd131, %r488, 4;
add.s64 %rd130, %rd42, %rd131;
// begin inline asm
ld.volatile.global.v2.s32 {%r594,%r593}, [%rd130];
// end inline asm
$L__BB0_167:
mov.b32 %f386, %r594;
add.f32 %f475, %f475, %f386;
mov.b32 %f387, %r593;
add.f32 %f474, %f474, %f387;
add.s32 %r592, %r592, 1;
setp.lt.s32 %p147, %r592, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
mov.b32 %r489, %f475;
mov.u32 %r490, 31;
mov.u32 %r491, 16;
mov.u32 %r492, -1;
shfl.sync.bfly.b32 %r493|%p148, %r489, %r491, %r490, %r492;
mov.b32 %f388, %r493;
add.f32 %f389, %f475, %f388;
mov.b32 %r494, %f389;
mov.u32 %r495, 8;
shfl.sync.bfly.b32 %r496|%p149, %r494, %r495, %r490, %r492;
mov.b32 %f390, %r496;
add.f32 %f391, %f389, %f390;
mov.b32 %r497, %f391;
mov.u32 %r498, 4;
shfl.sync.bfly.b32 %r499|%p150, %r497, %r498, %r490, %r492;
mov.b32 %f392, %r499;
add.f32 %f393, %f391, %f392;
mov.b32 %r500, %f393;
mov.u32 %r501, 2;
shfl.sync.bfly.b32 %r502|%p151, %r500, %r501, %r490, %r492;
mov.b32 %f394, %r502;
add.f32 %f395, %f393, %f394;
mov.b32 %r503, %f395;
mov.u32 %r504, 1;
shfl.sync.bfly.b32 %r505|%p152, %r503, %r504, %r490, %r492;
mov.b32 %f396, %r505;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
st.shared.f32 [%rd29], %f477;
$L__BB0_170:
setp.ne.s32 %p154, %r81, 0;
bar.sync 0;
@%p154 bra $L__BB0_174;
setp.ge.u32 %p155, %r82, %r80;
mov.f32 %f476, 0f00000000;
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
mov.b32 %r506, %f476;
mov.u32 %r507, 31;
mov.u32 %r508, 16;
mov.u32 %r509, -1;
shfl.sync.bfly.b32 %r510|%p156, %r506, %r508, %r507, %r509;
mov.b32 %f398, %r510;
add.f32 %f399, %f476, %f398;
mov.b32 %r511, %f399;
mov.u32 %r512, 8;
shfl.sync.bfly.b32 %r513|%p157, %r511, %r512, %r507, %r509;
mov.b32 %f400, %r513;
add.f32 %f401, %f399, %f400;
mov.b32 %r514, %f401;
mov.u32 %r515, 4;
shfl.sync.bfly.b32 %r516|%p158, %r514, %r515, %r507, %r509;
mov.b32 %f402, %r516;
add.f32 %f403, %f401, %f402;
mov.b32 %r517, %f403;
mov.u32 %r518, 2;
shfl.sync.bfly.b32 %r519|%p159, %r517, %r518, %r507, %r509;
mov.b32 %f404, %r519;
add.f32 %f405, %f403, %f404;
mov.b32 %r520, %f405;
mov.u32 %r521, 1;
shfl.sync.bfly.b32 %r522|%p160, %r520, %r521, %r507, %r509;
mov.b32 %f406, %r522;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
mov.b32 %r523, %f407;
setp.eq.s32 %p162, %r82, 0;
selp.b32 %r115, %r523, 0, %p162;
bar.sync 0;
mov.b32 %r524, %f474;
mov.u32 %r525, 31;
mov.u32 %r526, 16;
mov.u32 %r527, -1;
shfl.sync.bfly.b32 %r528|%p163, %r524, %r526, %r525, %r527;
mov.b32 %f408, %r528;
add.f32 %f409, %f474, %f408;
mov.b32 %r529, %f409;
mov.u32 %r530, 8;
shfl.sync.bfly.b32 %r531|%p164, %r529, %r530, %r525, %r527;
mov.b32 %f410, %r531;
add.f32 %f411, %f409, %f410;
mov.b32 %r532, %f411;
mov.u32 %r533, 4;
shfl.sync.bfly.b32 %r534|%p165, %r532, %r533, %r525, %r527;
mov.b32 %f412, %r534;
add.f32 %f413, %f411, %f412;
mov.b32 %r535, %f413;
mov.u32 %r536, 2;
shfl.sync.bfly.b32 %r537|%p166, %r535, %r536, %r525, %r527;
mov.b32 %f414, %r537;
add.f32 %f415, %f413, %f414;
mov.b32 %r538, %f415;
mov.u32 %r539, 1;
shfl.sync.bfly.b32 %r540|%p167, %r538, %r539, %r525, %r527;
mov.b32 %f416, %r540;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
$L__BB0_176:
bar.sync 0;
@%p154 bra $L__BB0_180;
setp.ge.u32 %p169, %r82, %r80;
mov.f32 %f478, 0f00000000;
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
mov.b32 %r541, %f478;
mov.u32 %r542, 31;
mov.u32 %r543, 16;
mov.u32 %r544, -1;
shfl.sync.bfly.b32 %r545|%p170, %r541, %r543, %r542, %r544;
mov.b32 %f418, %r545;
add.f32 %f419, %f478, %f418;
mov.b32 %r546, %f419;
mov.u32 %r547, 8;
shfl.sync.bfly.b32 %r548|%p171, %r546, %r547, %r542, %r544;
mov.b32 %f420, %r548;
add.f32 %f421, %f419, %f420;
mov.b32 %r549, %f421;
mov.u32 %r550, 4;
shfl.sync.bfly.b32 %r551|%p172, %r549, %r550, %r542, %r544;
mov.b32 %f422, %r551;
add.f32 %f423, %f421, %f422;
mov.b32 %r552, %f423;
mov.u32 %r553, 2;
shfl.sync.bfly.b32 %r554|%p173, %r552, %r553, %r542, %r544;
mov.b32 %f424, %r554;
add.f32 %f425, %f423, %f424;
mov.b32 %r555, %f425;
mov.u32 %r556, 1;
shfl.sync.bfly.b32 %r557|%p174, %r555, %r556, %r542, %r544;
mov.b32 %f426, %r557;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
setp.ne.s32 %p175, %r6, 0;
@%p175 bra $L__BB0_183;
mul.lo.s32 %r116, %r78, %r585;
add.s32 %r558, %r77, %r116;
setp.ge.s32 %p176, %r558, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r561, %r79, %r116;
mul.wide.s32 %rd133, %r561, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
mov.b32 %r562, %f427;
selp.b32 %r560, %r562, 0, %p162;
// begin inline asm
st.global.cs.v2.s32 [%rd132], {%r115,%r560};
// end inline asm
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
div.s32 %r84, %r75, %r3;
mad.lo.s32 %r85, %r155, %r6, %r76;
shl.b32 %r86, %r71, 1;
shl.b32 %r87, %r9, 1;
mul.lo.s32 %r88, %r155, %r3;
mov.u32 %r586, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
mad.lo.s32 %r90, %r78, %r586, %r77;
mad.lo.s32 %r399, %r87, %r586, %r86;
mad.lo.s32 %r588, %r4, %r399, %r85;
mov.u32 %r398, 0;
mov.f32 %f466, 0f00000000;
mov.u32 %r587, %r6;
mov.u32 %r589, %r398;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
mov.u32 %r590, %r398;
mov.u32 %r591, %r398;
@%p110 bra $L__BB0_144;
setp.ge.s32 %p111, %r587, %r9;
mov.u32 %r590, %r398;
mov.u32 %r591, %r398;
@%p111 bra $L__BB0_144;
mul.wide.s32 %rd127, %r588, 4;
add.s64 %rd126, %rd41, %rd127;
// begin inline asm
ld.volatile.global.v2.s32 {%r591,%r590}, [%rd126];
// end inline asm
$L__BB0_144:
mov.b32 %f340, %r591;
add.f32 %f467, %f467, %f340;
mov.b32 %f341, %r590;
add.f32 %f466, %f466, %f341;
add.s32 %r588, %r588, %r88;
add.s32 %r587, %r587, %r3;
add.s32 %r589, %r589, 1;
setp.lt.s32 %p112, %r589, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
mov.b32 %r406, %f467;
mov.u32 %r407, 31;
mov.u32 %r408, 16;
mov.u32 %r409, -1;
shfl.sync.bfly.b32 %r410|%p113, %r406, %r408, %r407, %r409;
mov.b32 %f342, %r410;
add.f32 %f343, %f467, %f342;
mov.b32 %r411, %f343;
mov.u32 %r412, 8;
shfl.sync.bfly.b32 %r413|%p114, %r411, %r412, %r407, %r409;
mov.b32 %f344, %r413;
add.f32 %f345, %f343, %f344;
mov.b32 %r414, %f345;
mov.u32 %r415, 4;
shfl.sync.bfly.b32 %r416|%p115, %r414, %r415, %r407, %r409;
mov.b32 %f346, %r416;
add.f32 %f347, %f345, %f346;
mov.b32 %r417, %f347;
mov.u32 %r418, 2;
shfl.sync.bfly.b32 %r419|%p116, %r417, %r418, %r407, %r409;
mov.b32 %f348, %r419;
add.f32 %f349, %f347, %f348;
mov.b32 %r420, %f349;
mov.u32 %r421, 1;
shfl.sync.bfly.b32 %r422|%p117, %r420, %r421, %r407, %r409;
mov.b32 %f350, %r422;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
st.shared.f32 [%rd29], %f469;
$L__BB0_147:
setp.ne.s32 %p119, %r81, 0;
bar.sync 0;
@%p119 bra $L__BB0_151;
setp.ge.u32 %p120, %r82, %r80;
mov.f32 %f468, 0f00000000;
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
mov.b32 %r423, %f468;
mov.u32 %r424, 31;
mov.u32 %r425, 16;
mov.u32 %r426, -1;
shfl.sync.bfly.b32 %r427|%p121, %r423, %r425, %r424, %r426;
mov.b32 %f352, %r427;
add.f32 %f353, %f468, %f352;
mov.b32 %r428, %f353;
mov.u32 %r429, 8;
shfl.sync.bfly.b32 %r430|%p122, %r428, %r429, %r424, %r426;
mov.b32 %f354, %r430;
add.f32 %f355, %f353, %f354;
mov.b32 %r431, %f355;
mov.u32 %r432, 4;
shfl.sync.bfly.b32 %r433|%p123, %r431, %r432, %r424, %r426;
mov.b32 %f356, %r433;
add.f32 %f357, %f355, %f356;
mov.b32 %r434, %f357;
mov.u32 %r435, 2;
shfl.sync.bfly.b32 %r436|%p124, %r434, %r435, %r424, %r426;
mov.b32 %f358, %r436;
add.f32 %f359, %f357, %f358;
mov.b32 %r437, %f359;
mov.u32 %r438, 1;
shfl.sync.bfly.b32 %r439|%p125, %r437, %r438, %r424, %r426;
mov.b32 %f360, %r439;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
mov.b32 %r440, %f361;
setp.eq.s32 %p127, %r82, 0;
selp.b32 %r102, %r440, 0, %p127;
bar.sync 0;
mov.b32 %r441, %f466;
mov.u32 %r442, 31;
mov.u32 %r443, 16;
mov.u32 %r444, -1;
shfl.sync.bfly.b32 %r445|%p128, %r441, %r443, %r442, %r444;
mov.b32 %f362, %r445;
add.f32 %f363, %f466, %f362;
mov.b32 %r446, %f363;
mov.u32 %r447, 8;
shfl.sync.bfly.b32 %r448|%p129, %r446, %r447, %r442, %r444;
mov.b32 %f364, %r448;
add.f32 %f365, %f363, %f364;
mov.b32 %r449, %f365;
mov.u32 %r450, 4;
shfl.sync.bfly.b32 %r451|%p130, %r449, %r450, %r442, %r444;
mov.b32 %f366, %r451;
add.f32 %f367, %f365, %f366;
mov.b32 %r452, %f367;
mov.u32 %r453, 2;
shfl.sync.bfly.b32 %r454|%p131, %r452, %r453, %r442, %r444;
mov.b32 %f368, %r454;
add.f32 %f369, %f367, %f368;
mov.b32 %r455, %f369;
mov.u32 %r456, 1;
shfl.sync.bfly.b32 %r457|%p132, %r455, %r456, %r442, %r444;
mov.b32 %f370, %r457;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
$L__BB0_153:
bar.sync 0;
@%p119 bra $L__BB0_157;
setp.ge.u32 %p134, %r82, %r80;
mov.f32 %f470, 0f00000000;
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
mov.b32 %r458, %f470;
mov.u32 %r459, 31;
mov.u32 %r460, 16;
mov.u32 %r461, -1;
shfl.sync.bfly.b32 %r462|%p135, %r458, %r460, %r459, %r461;
mov.b32 %f372, %r462;
add.f32 %f373, %f470, %f372;
mov.b32 %r463, %f373;
mov.u32 %r464, 8;
shfl.sync.bfly.b32 %r465|%p136, %r463, %r464, %r459, %r461;
mov.b32 %f374, %r465;
add.f32 %f375, %f373, %f374;
mov.b32 %r466, %f375;
mov.u32 %r467, 4;
shfl.sync.bfly.b32 %r468|%p137, %r466, %r467, %r459, %r461;
mov.b32 %f376, %r468;
add.f32 %f377, %f375, %f376;
mov.b32 %r469, %f377;
mov.u32 %r470, 2;
shfl.sync.bfly.b32 %r471|%p138, %r469, %r470, %r459, %r461;
mov.b32 %f378, %r471;
add.f32 %f379, %f377, %f378;
mov.b32 %r472, %f379;
mov.u32 %r473, 1;
shfl.sync.bfly.b32 %r474|%p139, %r472, %r473, %r459, %r461;
mov.b32 %f380, %r474;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
setp.ne.s32 %p140, %r6, 0;
@%p140 bra $L__BB0_160;
mul.lo.s32 %r103, %r78, %r586;
add.s32 %r475, %r77, %r103;
setp.ge.s32 %p141, %r475, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_18_cu_25ea9055_723310nvfuser_18ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r478, %r79, %r103;
mul.wide.s32 %rd129, %r478, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
mov.b32 %r479, %f381;
selp.b32 %r477, %r479, 0, %p127;
// begin inline asm
st.global.cs.v2.s32 [%rd128], {%r102,%r477};
// end inline asm
$L__BB0_160:
add.s32 %r586, %r586, 1;
setp.lt.s32 %p143, %r586, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
- .reg .b32 %r<597>;
+ .reg .b32 %r<595>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
@@ -50,136 +50,136 @@
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
- shr.s32 %r194, %r193, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r195, %r194, %r2;
- add.s32 %r196, %r195, 31;
- shr.s32 %r197, %r196, 31;
- shr.u32 %r198, %r197, 27;
- add.s32 %r199, %r196, %r198;
- shr.u32 %r200, %r199, 5;
- mov.u32 %r3, %ntid.y;
- mul.lo.s32 %r201, %r3, %r200;
- shl.b32 %r202, %r201, 7;
- cvt.u64.u32 %rd1, %r202;
- mul.lo.s32 %r203, %r3, %r194;
- shl.b32 %r204, %r203, 4;
- or.b32 %r205, %r204, 15;
- and.b32 %r4, %r205, -16;
- add.s32 %r206, %r205, %r4;
- and.b32 %r207, %r206, -16;
- cvt.s64.s32 %rd2, %r207;
+ shr.s32 %r2, %r193, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r194, %r2, %r3;
+ add.s32 %r195, %r194, 31;
+ shr.s32 %r196, %r195, 31;
+ shr.u32 %r197, %r196, 27;
+ add.s32 %r198, %r195, %r197;
+ shr.u32 %r199, %r198, 5;
+ mov.u32 %r4, %ntid.y;
+ mul.lo.s32 %r200, %r4, %r199;
+ shl.b32 %r201, %r200, 7;
+ cvt.u64.u32 %rd1, %r201;
+ mul.lo.s32 %r202, %r4, %r2;
+ shl.b32 %r203, %r202, 4;
+ or.b32 %r204, %r203, 15;
+ and.b32 %r5, %r204, -16;
+ add.s32 %r205, %r204, %r5;
+ and.b32 %r206, %r205, -16;
+ cvt.s64.s32 %rd2, %r206;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p6, %r5, %r194;
- shl.b32 %r6, %r5, 2;
- or.b32 %r208, %r6, 3;
- setp.lt.s32 %p7, %r208, %r155;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p6, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r207, %r7, 3;
+ setp.lt.s32 %p7, %r207, %r155;
and.pred %p1, %p7, %p6;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p8, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r209, smem_ptr; }
-
-
- shl.b32 %r212, %r5, 4;
- add.s32 %r210, %r209, %r212;
- mul.wide.s32 %rd48, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r208, smem_ptr; }
+
+
+ shl.b32 %r211, %r6, 4;
+ add.s32 %r209, %r208, %r211;
+ mul.wide.s32 %rd48, %r7, 4;
add.s64 %rd47, %rd37, %rd48;
- mov.u32 %r211, 0;
+ mov.u32 %r210, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r211, 0;
- cp.async.ca.shared.global [%r210], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r210, 0;
+ cp.async.ca.shared.global [%r209], [%rd47], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r213, %r3, 215;
- div.s32 %r214, %r213, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r215, %r8, %r214;
- add.s32 %r216, %r215, -1;
- div.s32 %r9, %r216, %r8;
- setp.gt.s32 %p10, %r9, 0;
+ add.s32 %r212, %r4, 215;
+ div.s32 %r213, %r212, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r214, %r9, %r213;
+ add.s32 %r215, %r214, -1;
+ div.s32 %r10, %r215, %r9;
+ setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
- cvt.s64.s32 %rd49, %r4;
+ cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
- mov.u32 %r218, %ctaid.y;
- mul.lo.s32 %r219, %r9, %r3;
- mul.lo.s32 %r10, %r219, %r218;
- shl.b32 %r220, %r7, 2;
- shl.b32 %r221, %r5, 4;
- mad.lo.s32 %r11, %r220, %r155, %r221;
- mul.lo.s32 %r222, %r155, %r7;
- cvt.s64.s32 %rd53, %r222;
- cvt.s64.s32 %rd54, %r6;
+ mov.u32 %r217, %ctaid.y;
+ mul.lo.s32 %r218, %r10, %r4;
+ mul.lo.s32 %r11, %r218, %r217;
+ mad.lo.s32 %r219, %r2, %r8, %r6;
+ shl.b32 %r12, %r219, 4;
+ mul.lo.s32 %r220, %r155, %r8;
+ cvt.s64.s32 %rd53, %r220;
+ cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r223, %r10, %r155;
- cvt.s64.s32 %rd6, %r223;
- mul.lo.s32 %r12, %r155, %r3;
- mul.lo.s32 %r13, %r9, %r218;
- add.s32 %r14, %r222, %r6;
+ mul.lo.s32 %r221, %r11, %r155;
+ cvt.s64.s32 %rd6, %r221;
+ mul.lo.s32 %r13, %r155, %r4;
+ mul.lo.s32 %r14, %r10, %r217;
+ shl.b32 %r222, %r8, 2;
+ mad.lo.s32 %r223, %r222, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r14, 4;
+ mul.wide.s32 %rd56, %r223, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
- mad.lo.s32 %r225, %r224, %r3, %r7;
- shr.u32 %r15, %r2, 5;
+ mad.lo.s32 %r225, %r224, %r4, %r8;
+ shr.u32 %r15, %r3, 5;
mul.lo.s32 %r226, %r225, %r15;
- shr.u32 %r16, %r5, 5;
+ shr.u32 %r16, %r6, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
- and.b32 %r17, %r5, 31;
+ and.b32 %r17, %r6, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
- mul.wide.s32 %rd60, %r6, 4;
+ mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
- mov.u32 %r565, 0;
+ mov.u32 %r563, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
- add.s32 %r232, %r11, %r231;
+ add.s32 %r232, %r231, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
- add.s32 %r258, %r11, %r257;
+ add.s32 %r258, %r257, %r12;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
@@ -188,16 +188,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
- mad.lo.s32 %r229, %r565, %r3, %r7;
- add.s32 %r230, %r229, %r10;
+ mad.lo.s32 %r229, %r563, %r4, %r8;
+ add.s32 %r230, %r229, %r11;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
- mul.lo.s32 %r234, %r12, %r565;
+ mul.lo.s32 %r234, %r13, %r563;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
@@ -216,53 +216,53 @@
cp.async.wait_all;
@%p11 bra $L__BB0_10;
- add.s32 %r235, %r13, %r565;
- mad.lo.s32 %r236, %r235, %r3, %r7;
+ add.s32 %r235, %r14, %r563;
+ mad.lo.s32 %r236, %r235, %r4, %r8;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
+ ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r245, %r13, %r565;
- mad.lo.s32 %r246, %r245, %r3, %r7;
+ add.s32 %r245, %r14, %r563;
+ mad.lo.s32 %r246, %r245, %r4, %r8;
setp.gt.s32 %p15, %r246, 215;
- mov.u32 %r566, 0;
- mov.u32 %r567, %r566;
- mov.u32 %r568, %r566;
- mov.u32 %r569, %r566;
+ mov.u32 %r564, 0;
+ mov.u32 %r565, %r564;
+ mov.u32 %r566, %r564;
+ mov.u32 %r567, %r564;
@%p15 bra $L__BB0_15;
- ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
+ ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r566, 0;
- mov.u32 %r567, %r566;
- mov.u32 %r568, %r566;
- mov.u32 %r569, %r566;
+ mov.u32 %r564, 0;
+ mov.u32 %r565, %r564;
+ mov.u32 %r566, %r564;
+ mov.u32 %r567, %r564;
$L__BB0_15:
- add.s32 %r255, %r13, %r565;
- mad.lo.s32 %r33, %r255, %r3, %r7;
- mov.b32 %f125, %r569;
+ add.s32 %r255, %r14, %r563;
+ mad.lo.s32 %r33, %r255, %r4, %r8;
+ mov.b32 %f125, %r567;
add.f32 %f455, %f455, %f125;
- mov.b32 %f126, %r568;
+ mov.b32 %f126, %r566;
add.f32 %f454, %f454, %f126;
- mov.b32 %f127, %r567;
+ mov.b32 %f127, %r565;
add.f32 %f453, %f453, %f127;
- mov.b32 %f128, %r566;
+ mov.b32 %f128, %r564;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
@@ -275,11 +275,11 @@
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
- mul.lo.s32 %r260, %r12, %r565;
+ mul.lo.s32 %r260, %r13, %r563;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
@@ -412,11 +412,11 @@
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
- setp.ne.s32 %p180, %r17, 0;
+ setp.ne.s32 %p181, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
@@ -442,21 +442,21 @@
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
- @%p180 bra $L__BB0_31;
+ @%p181 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
- setp.ne.s32 %p181, %r16, 0;
+ setp.ne.s32 %p178, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
- @%p181 bra $L__BB0_35;
+ @%p178 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
@@ -491,11 +491,11 @@
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
- setp.ne.s32 %p47, %r5, 0;
+ setp.ne.s32 %p47, %r6, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
@@ -553,21 +553,20 @@
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
- mad.lo.s32 %r334, %r565, %r3, %r10;
- mad.lo.s32 %r335, %r334, %r155, %r14;
- mul.wide.s32 %rd80, %r335, 4;
+ mad.lo.s32 %r334, %r33, %r155, %r7;
+ mul.wide.s32 %rd80, %r334, 4;
add.s64 %rd79, %rd38, %rd80;
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
$L__BB0_41:
- add.s32 %r565, %r565, 1;
- setp.lt.s32 %p51, %r565, %r9;
+ add.s32 %r563, %r563, 1;
+ setp.lt.s32 %p51, %r563, %r10;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
@@ -578,31 +577,31 @@
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
- mov.u32 %r336, %tid.z;
- mad.lo.s32 %r35, %r336, %r3, %r7;
- mad.lo.s32 %r36, %r35, %r2, %r5;
+ mov.u32 %r335, %tid.z;
+ mad.lo.s32 %r35, %r335, %r4, %r8;
+ mad.lo.s32 %r36, %r35, %r3, %r6;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
- clz.b32 %r337, %r3;
- mov.u32 %r338, 31;
- sub.s32 %r339, %r338, %r337;
- mov.u32 %r340, 1;
- shl.b32 %r37, %r340, %r339;
- setp.lt.u32 %p52, %r7, %r37;
- add.s32 %r341, %r37, %r7;
- setp.lt.u32 %p53, %r341, %r3;
+ clz.b32 %r336, %r4;
+ mov.u32 %r337, 31;
+ sub.s32 %r338, %r337, %r336;
+ mov.u32 %r339, 1;
+ shl.b32 %r37, %r339, %r338;
+ setp.lt.u32 %p52, %r8, %r37;
+ add.s32 %r340, %r37, %r8;
+ setp.lt.u32 %p53, %r340, %r4;
and.pred %p5, %p52, %p53;
- shl.b32 %r342, %r2, %r339;
- add.s32 %r343, %r36, %r342;
- mul.wide.s32 %rd83, %r343, 4;
+ shl.b32 %r341, %r3, %r338;
+ add.s32 %r342, %r36, %r341;
+ mul.wide.s32 %rd83, %r342, 4;
add.s64 %rd23, %rd44, %rd83;
- shr.u32 %r344, %r37, 31;
- add.s32 %r345, %r37, %r344;
- shr.s32 %r584, %r345, 1;
+ shr.u32 %r343, %r37, 31;
+ add.s32 %r344, %r37, %r343;
+ shr.s32 %r582, %r344, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
@@ -614,49 +613,49 @@
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
- mov.u32 %r570, %r584;
+ mov.u32 %r568, %r582;
$L__BB0_46:
- setp.ge.u32 %p56, %r7, %r570;
+ setp.ge.u32 %p56, %r8, %r568;
@%p56 bra $L__BB0_48;
- mad.lo.s32 %r346, %r570, %r2, %r36;
- mul.wide.s32 %rd84, %r346, 4;
+ mad.lo.s32 %r345, %r568, %r3, %r36;
+ mul.wide.s32 %rd84, %r345, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
- shr.u32 %r40, %r570, 1;
- setp.gt.u32 %p57, %r570, 3;
- mov.u32 %r570, %r40;
+ shr.u32 %r40, %r568, 1;
+ setp.gt.u32 %p57, %r568, 3;
+ mov.u32 %r568, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
- add.s32 %r348, %r36, %r2;
- mul.wide.u32 %rd87, %r348, 4;
+ add.s32 %r347, %r36, %r3;
+ mul.wide.u32 %rd87, %r347, 4;
add.s64 %rd24, %rd44, %rd87;
- setp.ne.s32 %p58, %r7, 0;
- mov.u32 %r571, 0;
+ setp.ne.s32 %p58, %r8, 0;
+ mov.u32 %r569, 0;
@%p58 bra $L__BB0_53;
- setp.lt.u32 %p59, %r3, 2;
+ setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
- mov.b32 %r571, %f456;
+ mov.b32 %r569, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@@ -669,45 +668,45 @@
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
- mov.u32 %r572, %r584;
+ mov.u32 %r570, %r582;
$L__BB0_57:
- setp.ge.u32 %p62, %r7, %r572;
+ setp.ge.u32 %p62, %r8, %r570;
@%p62 bra $L__BB0_59;
- mad.lo.s32 %r349, %r572, %r2, %r36;
- mul.wide.s32 %rd89, %r349, 4;
+ mad.lo.s32 %r348, %r570, %r3, %r36;
+ mul.wide.s32 %rd89, %r348, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
- shr.u32 %r44, %r572, 1;
- setp.gt.u32 %p63, %r572, 3;
- mov.u32 %r572, %r44;
+ shr.u32 %r44, %r570, 1;
+ setp.gt.u32 %p63, %r570, 3;
+ mov.u32 %r570, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
- mov.u32 %r573, 0;
+ mov.u32 %r571, 0;
@%p58 bra $L__BB0_64;
- setp.lt.u32 %p65, %r3, 2;
+ setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
- mov.b32 %r573, %f457;
+ mov.b32 %r571, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@@ -720,45 +719,45 @@
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
- mov.u32 %r574, %r584;
+ mov.u32 %r572, %r582;
$L__BB0_68:
- setp.ge.u32 %p68, %r7, %r574;
+ setp.ge.u32 %p68, %r8, %r572;
@%p68 bra $L__BB0_70;
- mad.lo.s32 %r351, %r574, %r2, %r36;
- mul.wide.s32 %rd92, %r351, 4;
+ mad.lo.s32 %r350, %r572, %r3, %r36;
+ mul.wide.s32 %rd92, %r350, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
- shr.u32 %r48, %r574, 1;
- setp.gt.u32 %p69, %r574, 3;
- mov.u32 %r574, %r48;
+ shr.u32 %r48, %r572, 1;
+ setp.gt.u32 %p69, %r572, 3;
+ mov.u32 %r572, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
- mov.u32 %r575, 0;
+ mov.u32 %r573, 0;
@%p58 bra $L__BB0_75;
- setp.lt.u32 %p71, %r3, 2;
+ setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
- mov.b32 %r575, %f458;
+ mov.b32 %r573, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@@ -771,45 +770,45 @@
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
- mov.u32 %r576, %r584;
+ mov.u32 %r574, %r582;
$L__BB0_79:
- setp.ge.u32 %p74, %r7, %r576;
+ setp.ge.u32 %p74, %r8, %r574;
@%p74 bra $L__BB0_81;
- mad.lo.s32 %r353, %r576, %r2, %r36;
- mul.wide.s32 %rd95, %r353, 4;
+ mad.lo.s32 %r352, %r574, %r3, %r36;
+ mul.wide.s32 %rd95, %r352, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
- shr.u32 %r52, %r576, 1;
- setp.gt.u32 %p75, %r576, 3;
- mov.u32 %r576, %r52;
+ shr.u32 %r52, %r574, 1;
+ setp.gt.u32 %p75, %r574, 3;
+ mov.u32 %r574, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
- mov.u32 %r577, 0;
+ mov.u32 %r575, 0;
@%p58 bra $L__BB0_86;
- setp.lt.u32 %p77, %r3, 2;
+ setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
- mov.b32 %r577, %f459;
+ mov.b32 %r575, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@@ -822,45 +821,45 @@
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
- mov.u32 %r578, %r584;
+ mov.u32 %r576, %r582;
$L__BB0_90:
- setp.ge.u32 %p80, %r7, %r578;
+ setp.ge.u32 %p80, %r8, %r576;
@%p80 bra $L__BB0_92;
- mad.lo.s32 %r355, %r578, %r2, %r36;
- mul.wide.s32 %rd98, %r355, 4;
+ mad.lo.s32 %r354, %r576, %r3, %r36;
+ mul.wide.s32 %rd98, %r354, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
- shr.u32 %r56, %r578, 1;
- setp.gt.u32 %p81, %r578, 3;
- mov.u32 %r578, %r56;
+ shr.u32 %r56, %r576, 1;
+ setp.gt.u32 %p81, %r576, 3;
+ mov.u32 %r576, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
- mov.u32 %r579, 0;
+ mov.u32 %r577, 0;
@%p58 bra $L__BB0_97;
- setp.lt.u32 %p83, %r3, 2;
+ setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
- mov.b32 %r579, %f460;
+ mov.b32 %r577, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@@ -873,45 +872,45 @@
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
- mov.u32 %r580, %r584;
+ mov.u32 %r578, %r582;
$L__BB0_101:
- setp.ge.u32 %p86, %r7, %r580;
+ setp.ge.u32 %p86, %r8, %r578;
@%p86 bra $L__BB0_103;
- mad.lo.s32 %r357, %r580, %r2, %r36;
- mul.wide.s32 %rd101, %r357, 4;
+ mad.lo.s32 %r356, %r578, %r3, %r36;
+ mul.wide.s32 %rd101, %r356, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
- shr.u32 %r60, %r580, 1;
- setp.gt.u32 %p87, %r580, 3;
- mov.u32 %r580, %r60;
+ shr.u32 %r60, %r578, 1;
+ setp.gt.u32 %p87, %r578, 3;
+ mov.u32 %r578, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
- mov.u32 %r581, 0;
+ mov.u32 %r579, 0;
@%p58 bra $L__BB0_108;
- setp.lt.u32 %p89, %r3, 2;
+ setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
- mov.b32 %r581, %f461;
+ mov.b32 %r579, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@@ -924,45 +923,45 @@
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
- mov.u32 %r582, %r584;
+ mov.u32 %r580, %r582;
$L__BB0_112:
- setp.ge.u32 %p92, %r7, %r582;
+ setp.ge.u32 %p92, %r8, %r580;
@%p92 bra $L__BB0_114;
- mad.lo.s32 %r359, %r582, %r2, %r36;
- mul.wide.s32 %rd104, %r359, 4;
+ mad.lo.s32 %r358, %r580, %r3, %r36;
+ mul.wide.s32 %rd104, %r358, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
- shr.u32 %r64, %r582, 1;
- setp.gt.u32 %p93, %r582, 3;
- mov.u32 %r582, %r64;
+ shr.u32 %r64, %r580, 1;
+ setp.gt.u32 %p93, %r580, 3;
+ mov.u32 %r580, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
- mov.u32 %r583, 0;
+ mov.u32 %r581, 0;
@%p58 bra $L__BB0_119;
- setp.lt.u32 %p95, %r3, 2;
+ setp.lt.u32 %p95, %r4, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
- mov.b32 %r583, %f462;
+ mov.b32 %r581, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@@ -976,217 +975,216 @@
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
- setp.ge.u32 %p98, %r7, %r584;
+ setp.ge.u32 %p98, %r8, %r582;
@%p98 bra $L__BB0_124;
- mad.lo.s32 %r361, %r584, %r2, %r36;
- mul.wide.s32 %rd107, %r361, 4;
+ mad.lo.s32 %r360, %r582, %r3, %r36;
+ mul.wide.s32 %rd107, %r360, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
- shr.u32 %r68, %r584, 1;
- setp.gt.u32 %p99, %r584, 3;
- mov.u32 %r584, %r68;
+ shr.u32 %r68, %r582, 1;
+ setp.gt.u32 %p99, %r582, 3;
+ mov.u32 %r582, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
- mov.u32 %r585, 0;
+ mov.u32 %r583, 0;
@%p58 bra $L__BB0_129;
- setp.lt.u32 %p101, %r3, 2;
+ setp.lt.u32 %p101, %r4, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
- mov.b32 %r585, %f463;
+ mov.b32 %r583, %f463;
$L__BB0_129:
- setp.eq.s32 %p179, %r7, 0;
- and.pred %p178, %p179, %p1;
- bar.sync 0;
- @%p178 bra $L__BB0_130;
+ setp.eq.s32 %p180, %r8, 0;
+ and.pred %p179, %p180, %p1;
+ bar.sync 0;
+ @%p179 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
- shl.b32 %r564, %r5, 2;
- mov.u32 %r371, %ctaid.y;
- mad.lo.s32 %r372, %r155, %r371, %r564;
- mul.wide.s32 %rd112, %r372, 4;
+ mov.u32 %r370, %ctaid.y;
+ mad.lo.s32 %r371, %r155, %r370, %r7;
+ mul.wide.s32 %rd112, %r371, 4;
add.s64 %rd110, %rd41, %rd112;
- st.volatile.global.v4.s32 [%rd110], {%r571,%r573,%r575,%r577};
+ st.volatile.global.v4.s32 [%rd110], {%r569,%r571,%r573,%r575};
add.s64 %rd111, %rd42, %rd112;
- st.volatile.global.v4.s32 [%rd111], {%r579,%r581,%r583,%r585};
+ st.volatile.global.v4.s32 [%rd111], {%r577,%r579,%r581,%r583};
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r373, %r5, %r7;
- or.b32 %r375, %r373, %r336;
- setp.ne.s32 %p102, %r375, 0;
+ or.b32 %r372, %r6, %r8;
+ or.b32 %r374, %r372, %r335;
+ setp.ne.s32 %p102, %r374, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
- mov.u32 %r376, %ctaid.x;
- mov.u32 %r377, %ctaid.z;
- mov.u32 %r378, %nctaid.x;
- mad.lo.s32 %r379, %r377, %r378, %r376;
- mul.wide.s32 %rd114, %r379, 8;
+ mov.u32 %r375, %ctaid.x;
+ mov.u32 %r376, %ctaid.z;
+ mov.u32 %r377, %nctaid.x;
+ mad.lo.s32 %r378, %r376, %r377, %r375;
+ mul.wide.s32 %rd114, %r378, 8;
add.s64 %rd27, %rd113, %rd114;
- add.s32 %r380, %r8, -1;
- setp.eq.s32 %p103, %r71, %r380;
- cvt.s64.s32 %rd115, %r8;
+ add.s32 %r379, %r9, -1;
+ setp.eq.s32 %p103, %r71, %r379;
+ cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
- mov.u32 %r586, 8;
+ mov.u32 %r584, 8;
$L__BB0_134:
- nanosleep.u32 %r586;
-
- setp.lt.u32 %p105, %r586, 256;
- selp.u32 %r383, 1, 0, %p105;
- shl.b32 %r586, %r586, %r383;
+ nanosleep.u32 %r584;
+
+ setp.lt.u32 %p105, %r584, 256;
+ selp.u32 %r382, 1, 0, %p105;
+ shl.b32 %r584, %r584, %r382;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
- add.s32 %r385, %r155, 1;
- shr.u32 %r386, %r385, 31;
- add.s32 %r387, %r385, %r386;
- shr.s32 %r388, %r387, 1;
- add.s32 %r389, %r3, %r388;
- add.s32 %r390, %r389, -1;
- div.s32 %r391, %r390, %r3;
- add.s32 %r392, %r8, -1;
- add.s32 %r393, %r392, %r391;
- div.s32 %r74, %r393, %r8;
- add.s32 %r75, %r392, %r2;
- shl.b32 %r76, %r7, 1;
- shl.b32 %r394, %r3, 1;
- mad.lo.s32 %r79, %r394, %r71, %r76;
+ add.s32 %r384, %r155, 1;
+ shr.u32 %r385, %r384, 31;
+ add.s32 %r386, %r384, %r385;
+ shr.s32 %r387, %r386, 1;
+ add.s32 %r388, %r4, %r387;
+ add.s32 %r389, %r388, -1;
+ div.s32 %r390, %r389, %r4;
+ add.s32 %r391, %r9, -1;
+ add.s32 %r392, %r391, %r390;
+ div.s32 %r74, %r392, %r9;
+ add.s32 %r75, %r391, %r3;
+ shl.b32 %r76, %r8, 1;
+ shl.b32 %r393, %r4, 1;
+ mad.lo.s32 %r79, %r393, %r71, %r76;
or.b32 %r77, %r79, 1;
- mul.lo.s32 %r78, %r394, %r8;
- shr.u32 %r80, %r2, 5;
- mul.lo.s32 %r395, %r35, %r80;
- shr.u32 %r81, %r5, 5;
- add.s32 %r396, %r395, %r81;
- mul.wide.u32 %rd123, %r396, 4;
+ mul.lo.s32 %r78, %r393, %r9;
+ shr.u32 %r80, %r3, 5;
+ mul.lo.s32 %r394, %r35, %r80;
+ shr.u32 %r81, %r6, 5;
+ add.s32 %r395, %r394, %r81;
+ mul.wide.u32 %rd123, %r395, 4;
add.s64 %rd29, %rd44, %rd123;
- and.b32 %r82, %r5, 31;
- add.s32 %r397, %r395, %r82;
- mul.wide.u32 %rd125, %r397, 4;
+ and.b32 %r82, %r6, 31;
+ add.s32 %r396, %r394, %r82;
+ mul.wide.u32 %rd125, %r396, 4;
add.s64 %rd30, %rd44, %rd125;
- mov.u32 %r587, 0;
+ mov.u32 %r585, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
- add.s32 %r587, %r587, 1;
+ add.s32 %r585, %r585, 1;
$L__BB0_136:
.pragma "nounroll";
- setp.lt.s32 %p107, %r587, %r74;
+ setp.lt.s32 %p107, %r585, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
- div.s32 %r105, %r75, %r2;
+ div.s32 %r105, %r75, %r3;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
- mul.lo.s32 %r482, %r78, %r587;
- add.s32 %r106, %r77, %r482;
- add.s32 %r107, %r79, %r482;
- mov.u32 %r481, 0;
+ mul.lo.s32 %r481, %r78, %r585;
+ add.s32 %r106, %r77, %r481;
+ add.s32 %r107, %r79, %r481;
+ mov.u32 %r480, 0;
mov.f32 %f474, 0f00000000;
- mov.u32 %r594, %r481;
+ mov.u32 %r592, %r480;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
- mov.u32 %r595, %r481;
- mov.u32 %r596, %r481;
+ mov.u32 %r593, %r480;
+ mov.u32 %r594, %r480;
@%p145 bra $L__BB0_167;
- mad.lo.s32 %r109, %r594, %r2, %r5;
- setp.ge.s32 %p146, %r109, %r8;
- mov.u32 %r595, %r481;
- mov.u32 %r596, %r481;
+ mad.lo.s32 %r109, %r592, %r3, %r6;
+ setp.ge.s32 %p146, %r109, %r9;
+ mov.u32 %r593, %r480;
+ mov.u32 %r594, %r480;
@%p146 bra $L__BB0_167;
- mad.lo.s32 %r489, %r109, %r155, %r107;
- mul.wide.s32 %rd131, %r489, 4;
+ mad.lo.s32 %r488, %r109, %r155, %r107;
+ mul.wide.s32 %rd131, %r488, 4;
add.s64 %rd130, %rd42, %rd131;
- ld.volatile.global.v2.s32 {%r596,%r595}, [%rd130];
+ ld.volatile.global.v2.s32 {%r594,%r593}, [%rd130];
$L__BB0_167:
- mov.b32 %f386, %r596;
+ mov.b32 %f386, %r594;
add.f32 %f475, %f475, %f386;
- mov.b32 %f387, %r595;
+ mov.b32 %f387, %r593;
add.f32 %f474, %f474, %f387;
- add.s32 %r594, %r594, 1;
- setp.lt.s32 %p147, %r594, %r105;
+ add.s32 %r592, %r592, 1;
+ setp.lt.s32 %p147, %r592, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
- mov.b32 %r490, %f475;
- mov.u32 %r491, 31;
- mov.u32 %r492, 16;
- mov.u32 %r493, -1;
- shfl.sync.bfly.b32 %r494|%p148, %r490, %r492, %r491, %r493;
- mov.b32 %f388, %r494;
+ mov.b32 %r489, %f475;
+ mov.u32 %r490, 31;
+ mov.u32 %r491, 16;
+ mov.u32 %r492, -1;
+ shfl.sync.bfly.b32 %r493|%p148, %r489, %r491, %r490, %r492;
+ mov.b32 %f388, %r493;
add.f32 %f389, %f475, %f388;
- mov.b32 %r495, %f389;
- mov.u32 %r496, 8;
- shfl.sync.bfly.b32 %r497|%p149, %r495, %r496, %r491, %r493;
- mov.b32 %f390, %r497;
+ mov.b32 %r494, %f389;
+ mov.u32 %r495, 8;
+ shfl.sync.bfly.b32 %r496|%p149, %r494, %r495, %r490, %r492;
+ mov.b32 %f390, %r496;
add.f32 %f391, %f389, %f390;
- mov.b32 %r498, %f391;
- mov.u32 %r499, 4;
- shfl.sync.bfly.b32 %r500|%p150, %r498, %r499, %r491, %r493;
- mov.b32 %f392, %r500;
+ mov.b32 %r497, %f391;
+ mov.u32 %r498, 4;
+ shfl.sync.bfly.b32 %r499|%p150, %r497, %r498, %r490, %r492;
+ mov.b32 %f392, %r499;
add.f32 %f393, %f391, %f392;
- mov.b32 %r501, %f393;
- mov.u32 %r502, 2;
- shfl.sync.bfly.b32 %r503|%p151, %r501, %r502, %r491, %r493;
- mov.b32 %f394, %r503;
+ mov.b32 %r500, %f393;
+ mov.u32 %r501, 2;
+ shfl.sync.bfly.b32 %r502|%p151, %r500, %r501, %r490, %r492;
+ mov.b32 %f394, %r502;
add.f32 %f395, %f393, %f394;
- mov.b32 %r504, %f395;
- mov.u32 %r505, 1;
- shfl.sync.bfly.b32 %r506|%p152, %r504, %r505, %r491, %r493;
- mov.b32 %f396, %r506;
+ mov.b32 %r503, %f395;
+ mov.u32 %r504, 1;
+ shfl.sync.bfly.b32 %r505|%p152, %r503, %r504, %r490, %r492;
+ mov.b32 %f396, %r505;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
@@ -1202,70 +1200,70 @@
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
- mov.b32 %r507, %f476;
- mov.u32 %r508, 31;
- mov.u32 %r509, 16;
- mov.u32 %r510, -1;
- shfl.sync.bfly.b32 %r511|%p156, %r507, %r509, %r508, %r510;
- mov.b32 %f398, %r511;
+ mov.b32 %r506, %f476;
+ mov.u32 %r507, 31;
+ mov.u32 %r508, 16;
+ mov.u32 %r509, -1;
+ shfl.sync.bfly.b32 %r510|%p156, %r506, %r508, %r507, %r509;
+ mov.b32 %f398, %r510;
add.f32 %f399, %f476, %f398;
- mov.b32 %r512, %f399;
- mov.u32 %r513, 8;
- shfl.sync.bfly.b32 %r514|%p157, %r512, %r513, %r508, %r510;
- mov.b32 %f400, %r514;
+ mov.b32 %r511, %f399;
+ mov.u32 %r512, 8;
+ shfl.sync.bfly.b32 %r513|%p157, %r511, %r512, %r507, %r509;
+ mov.b32 %f400, %r513;
add.f32 %f401, %f399, %f400;
- mov.b32 %r515, %f401;
- mov.u32 %r516, 4;
- shfl.sync.bfly.b32 %r517|%p158, %r515, %r516, %r508, %r510;
- mov.b32 %f402, %r517;
+ mov.b32 %r514, %f401;
+ mov.u32 %r515, 4;
+ shfl.sync.bfly.b32 %r516|%p158, %r514, %r515, %r507, %r509;
+ mov.b32 %f402, %r516;
add.f32 %f403, %f401, %f402;
- mov.b32 %r518, %f403;
- mov.u32 %r519, 2;
- shfl.sync.bfly.b32 %r520|%p159, %r518, %r519, %r508, %r510;
- mov.b32 %f404, %r520;
+ mov.b32 %r517, %f403;
+ mov.u32 %r518, 2;
+ shfl.sync.bfly.b32 %r519|%p159, %r517, %r518, %r507, %r509;
+ mov.b32 %f404, %r519;
add.f32 %f405, %f403, %f404;
- mov.b32 %r521, %f405;
- mov.u32 %r522, 1;
- shfl.sync.bfly.b32 %r523|%p160, %r521, %r522, %r508, %r510;
- mov.b32 %f406, %r523;
+ mov.b32 %r520, %f405;
+ mov.u32 %r521, 1;
+ shfl.sync.bfly.b32 %r522|%p160, %r520, %r521, %r507, %r509;
+ mov.b32 %f406, %r522;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
- mov.b32 %r524, %f407;
+ mov.b32 %r523, %f407;
setp.eq.s32 %p162, %r82, 0;
- selp.b32 %r115, %r524, 0, %p162;
- bar.sync 0;
- mov.b32 %r525, %f474;
- mov.u32 %r526, 31;
- mov.u32 %r527, 16;
- mov.u32 %r528, -1;
- shfl.sync.bfly.b32 %r529|%p163, %r525, %r527, %r526, %r528;
- mov.b32 %f408, %r529;
+ selp.b32 %r115, %r523, 0, %p162;
+ bar.sync 0;
+ mov.b32 %r524, %f474;
+ mov.u32 %r525, 31;
+ mov.u32 %r526, 16;
+ mov.u32 %r527, -1;
+ shfl.sync.bfly.b32 %r528|%p163, %r524, %r526, %r525, %r527;
+ mov.b32 %f408, %r528;
add.f32 %f409, %f474, %f408;
- mov.b32 %r530, %f409;
- mov.u32 %r531, 8;
- shfl.sync.bfly.b32 %r532|%p164, %r530, %r531, %r526, %r528;
- mov.b32 %f410, %r532;
+ mov.b32 %r529, %f409;
+ mov.u32 %r530, 8;
+ shfl.sync.bfly.b32 %r531|%p164, %r529, %r530, %r525, %r527;
+ mov.b32 %f410, %r531;
add.f32 %f411, %f409, %f410;
- mov.b32 %r533, %f411;
- mov.u32 %r534, 4;
- shfl.sync.bfly.b32 %r535|%p165, %r533, %r534, %r526, %r528;
- mov.b32 %f412, %r535;
+ mov.b32 %r532, %f411;
+ mov.u32 %r533, 4;
+ shfl.sync.bfly.b32 %r534|%p165, %r532, %r533, %r525, %r527;
+ mov.b32 %f412, %r534;
add.f32 %f413, %f411, %f412;
- mov.b32 %r536, %f413;
- mov.u32 %r537, 2;
- shfl.sync.bfly.b32 %r538|%p166, %r536, %r537, %r526, %r528;
- mov.b32 %f414, %r538;
+ mov.b32 %r535, %f413;
+ mov.u32 %r536, 2;
+ shfl.sync.bfly.b32 %r537|%p166, %r535, %r536, %r525, %r527;
+ mov.b32 %f414, %r537;
add.f32 %f415, %f413, %f414;
- mov.b32 %r539, %f415;
- mov.u32 %r540, 1;
- shfl.sync.bfly.b32 %r541|%p167, %r539, %r540, %r526, %r528;
- mov.b32 %f416, %r541;
+ mov.b32 %r538, %f415;
+ mov.u32 %r539, 1;
+ shfl.sync.bfly.b32 %r540|%p167, %r538, %r539, %r525, %r527;
+ mov.b32 %f416, %r540;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
@@ -1279,142 +1277,142 @@
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
- mov.b32 %r542, %f478;
- mov.u32 %r543, 31;
- mov.u32 %r544, 16;
- mov.u32 %r545, -1;
- shfl.sync.bfly.b32 %r546|%p170, %r542, %r544, %r543, %r545;
- mov.b32 %f418, %r546;
+ mov.b32 %r541, %f478;
+ mov.u32 %r542, 31;
+ mov.u32 %r543, 16;
+ mov.u32 %r544, -1;
+ shfl.sync.bfly.b32 %r545|%p170, %r541, %r543, %r542, %r544;
+ mov.b32 %f418, %r545;
add.f32 %f419, %f478, %f418;
- mov.b32 %r547, %f419;
- mov.u32 %r548, 8;
- shfl.sync.bfly.b32 %r549|%p171, %r547, %r548, %r543, %r545;
- mov.b32 %f420, %r549;
+ mov.b32 %r546, %f419;
+ mov.u32 %r547, 8;
+ shfl.sync.bfly.b32 %r548|%p171, %r546, %r547, %r542, %r544;
+ mov.b32 %f420, %r548;
add.f32 %f421, %f419, %f420;
- mov.b32 %r550, %f421;
- mov.u32 %r551, 4;
- shfl.sync.bfly.b32 %r552|%p172, %r550, %r551, %r543, %r545;
- mov.b32 %f422, %r552;
+ mov.b32 %r549, %f421;
+ mov.u32 %r550, 4;
+ shfl.sync.bfly.b32 %r551|%p172, %r549, %r550, %r542, %r544;
+ mov.b32 %f422, %r551;
add.f32 %f423, %f421, %f422;
- mov.b32 %r553, %f423;
- mov.u32 %r554, 2;
- shfl.sync.bfly.b32 %r555|%p173, %r553, %r554, %r543, %r545;
- mov.b32 %f424, %r555;
+ mov.b32 %r552, %f423;
+ mov.u32 %r553, 2;
+ shfl.sync.bfly.b32 %r554|%p173, %r552, %r553, %r542, %r544;
+ mov.b32 %f424, %r554;
add.f32 %f425, %f423, %f424;
- mov.b32 %r556, %f425;
- mov.u32 %r557, 1;
- shfl.sync.bfly.b32 %r558|%p174, %r556, %r557, %r543, %r545;
- mov.b32 %f426, %r558;
+ mov.b32 %r555, %f425;
+ mov.u32 %r556, 1;
+ shfl.sync.bfly.b32 %r557|%p174, %r555, %r556, %r542, %r544;
+ mov.b32 %f426, %r557;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
- setp.ne.s32 %p175, %r5, 0;
+ setp.ne.s32 %p175, %r6, 0;
@%p175 bra $L__BB0_183;
- mul.lo.s32 %r116, %r78, %r587;
- add.s32 %r559, %r77, %r116;
- setp.ge.s32 %p176, %r559, %r155;
+ mul.lo.s32 %r116, %r78, %r585;
+ add.s32 %r558, %r77, %r116;
+ setp.ge.s32 %p176, %r558, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r562, %r79, %r116;
- mul.wide.s32 %rd133, %r562, 4;
+ add.s32 %r561, %r79, %r116;
+ mul.wide.s32 %rd133, %r561, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
- mov.b32 %r563, %f427;
- selp.b32 %r561, %r563, 0, %p162;
-
- st.global.cs.v2.s32 [%rd132], {%r115,%r561};
+ mov.b32 %r562, %f427;
+ selp.b32 %r560, %r562, 0, %p162;
+
+ st.global.cs.v2.s32 [%rd132], {%r115,%r560};
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
- div.s32 %r84, %r75, %r2;
- mad.lo.s32 %r85, %r155, %r5, %r76;
+ div.s32 %r84, %r75, %r3;
+ mad.lo.s32 %r85, %r155, %r6, %r76;
shl.b32 %r86, %r71, 1;
- shl.b32 %r87, %r8, 1;
- mul.lo.s32 %r88, %r155, %r2;
- mov.u32 %r588, 0;
+ shl.b32 %r87, %r9, 1;
+ mul.lo.s32 %r88, %r155, %r3;
+ mov.u32 %r586, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
- mad.lo.s32 %r90, %r78, %r588, %r77;
- mad.lo.s32 %r400, %r87, %r588, %r86;
- mad.lo.s32 %r590, %r3, %r400, %r85;
- mov.u32 %r399, 0;
+ mad.lo.s32 %r90, %r78, %r586, %r77;
+ mad.lo.s32 %r399, %r87, %r586, %r86;
+ mad.lo.s32 %r588, %r4, %r399, %r85;
+ mov.u32 %r398, 0;
mov.f32 %f466, 0f00000000;
- mov.u32 %r589, %r5;
- mov.u32 %r591, %r399;
+ mov.u32 %r587, %r6;
+ mov.u32 %r589, %r398;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
- mov.u32 %r592, %r399;
- mov.u32 %r593, %r399;
+ mov.u32 %r590, %r398;
+ mov.u32 %r591, %r398;
@%p110 bra $L__BB0_144;
- setp.ge.s32 %p111, %r589, %r8;
- mov.u32 %r592, %r399;
- mov.u32 %r593, %r399;
+ setp.ge.s32 %p111, %r587, %r9;
+ mov.u32 %r590, %r398;
+ mov.u32 %r591, %r398;
@%p111 bra $L__BB0_144;
- mul.wide.s32 %rd127, %r590, 4;
+ mul.wide.s32 %rd127, %r588, 4;
add.s64 %rd126, %rd41, %rd127;
- ld.volatile.global.v2.s32 {%r593,%r592}, [%rd126];
+ ld.volatile.global.v2.s32 {%r591,%r590}, [%rd126];
$L__BB0_144:
- mov.b32 %f340, %r593;
+ mov.b32 %f340, %r591;
add.f32 %f467, %f467, %f340;
- mov.b32 %f341, %r592;
+ mov.b32 %f341, %r590;
add.f32 %f466, %f466, %f341;
- add.s32 %r590, %r590, %r88;
- add.s32 %r589, %r589, %r2;
- add.s32 %r591, %r591, 1;
- setp.lt.s32 %p112, %r591, %r84;
+ add.s32 %r588, %r588, %r88;
+ add.s32 %r587, %r587, %r3;
+ add.s32 %r589, %r589, 1;
+ setp.lt.s32 %p112, %r589, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
- mov.b32 %r407, %f467;
- mov.u32 %r408, 31;
- mov.u32 %r409, 16;
- mov.u32 %r410, -1;
- shfl.sync.bfly.b32 %r411|%p113, %r407, %r409, %r408, %r410;
- mov.b32 %f342, %r411;
+ mov.b32 %r406, %f467;
+ mov.u32 %r407, 31;
+ mov.u32 %r408, 16;
+ mov.u32 %r409, -1;
+ shfl.sync.bfly.b32 %r410|%p113, %r406, %r408, %r407, %r409;
+ mov.b32 %f342, %r410;
add.f32 %f343, %f467, %f342;
- mov.b32 %r412, %f343;
- mov.u32 %r413, 8;
- shfl.sync.bfly.b32 %r414|%p114, %r412, %r413, %r408, %r410;
- mov.b32 %f344, %r414;
+ mov.b32 %r411, %f343;
+ mov.u32 %r412, 8;
+ shfl.sync.bfly.b32 %r413|%p114, %r411, %r412, %r407, %r409;
+ mov.b32 %f344, %r413;
add.f32 %f345, %f343, %f344;
- mov.b32 %r415, %f345;
- mov.u32 %r416, 4;
- shfl.sync.bfly.b32 %r417|%p115, %r415, %r416, %r408, %r410;
- mov.b32 %f346, %r417;
+ mov.b32 %r414, %f345;
+ mov.u32 %r415, 4;
+ shfl.sync.bfly.b32 %r416|%p115, %r414, %r415, %r407, %r409;
+ mov.b32 %f346, %r416;
add.f32 %f347, %f345, %f346;
- mov.b32 %r418, %f347;
- mov.u32 %r419, 2;
- shfl.sync.bfly.b32 %r420|%p116, %r418, %r419, %r408, %r410;
- mov.b32 %f348, %r420;
+ mov.b32 %r417, %f347;
+ mov.u32 %r418, 2;
+ shfl.sync.bfly.b32 %r419|%p116, %r417, %r418, %r407, %r409;
+ mov.b32 %f348, %r419;
add.f32 %f349, %f347, %f348;
- mov.b32 %r421, %f349;
- mov.u32 %r422, 1;
- shfl.sync.bfly.b32 %r423|%p117, %r421, %r422, %r408, %r410;
- mov.b32 %f350, %r423;
+ mov.b32 %r420, %f349;
+ mov.u32 %r421, 1;
+ shfl.sync.bfly.b32 %r422|%p117, %r420, %r421, %r407, %r409;
+ mov.b32 %f350, %r422;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
@@ -1430,70 +1428,70 @@
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
- mov.b32 %r424, %f468;
- mov.u32 %r425, 31;
- mov.u32 %r426, 16;
- mov.u32 %r427, -1;
- shfl.sync.bfly.b32 %r428|%p121, %r424, %r426, %r425, %r427;
- mov.b32 %f352, %r428;
+ mov.b32 %r423, %f468;
+ mov.u32 %r424, 31;
+ mov.u32 %r425, 16;
+ mov.u32 %r426, -1;
+ shfl.sync.bfly.b32 %r427|%p121, %r423, %r425, %r424, %r426;
+ mov.b32 %f352, %r427;
add.f32 %f353, %f468, %f352;
- mov.b32 %r429, %f353;
- mov.u32 %r430, 8;
- shfl.sync.bfly.b32 %r431|%p122, %r429, %r430, %r425, %r427;
- mov.b32 %f354, %r431;
+ mov.b32 %r428, %f353;
+ mov.u32 %r429, 8;
+ shfl.sync.bfly.b32 %r430|%p122, %r428, %r429, %r424, %r426;
+ mov.b32 %f354, %r430;
add.f32 %f355, %f353, %f354;
- mov.b32 %r432, %f355;
- mov.u32 %r433, 4;
- shfl.sync.bfly.b32 %r434|%p123, %r432, %r433, %r425, %r427;
- mov.b32 %f356, %r434;
+ mov.b32 %r431, %f355;
+ mov.u32 %r432, 4;
+ shfl.sync.bfly.b32 %r433|%p123, %r431, %r432, %r424, %r426;
+ mov.b32 %f356, %r433;
add.f32 %f357, %f355, %f356;
- mov.b32 %r435, %f357;
- mov.u32 %r436, 2;
- shfl.sync.bfly.b32 %r437|%p124, %r435, %r436, %r425, %r427;
- mov.b32 %f358, %r437;
+ mov.b32 %r434, %f357;
+ mov.u32 %r435, 2;
+ shfl.sync.bfly.b32 %r436|%p124, %r434, %r435, %r424, %r426;
+ mov.b32 %f358, %r436;
add.f32 %f359, %f357, %f358;
- mov.b32 %r438, %f359;
- mov.u32 %r439, 1;
- shfl.sync.bfly.b32 %r440|%p125, %r438, %r439, %r425, %r427;
- mov.b32 %f360, %r440;
+ mov.b32 %r437, %f359;
+ mov.u32 %r438, 1;
+ shfl.sync.bfly.b32 %r439|%p125, %r437, %r438, %r424, %r426;
+ mov.b32 %f360, %r439;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
- mov.b32 %r441, %f361;
+ mov.b32 %r440, %f361;
setp.eq.s32 %p127, %r82, 0;
- selp.b32 %r102, %r441, 0, %p127;
- bar.sync 0;
- mov.b32 %r442, %f466;
- mov.u32 %r443, 31;
- mov.u32 %r444, 16;
- mov.u32 %r445, -1;
- shfl.sync.bfly.b32 %r446|%p128, %r442, %r444, %r443, %r445;
- mov.b32 %f362, %r446;
+ selp.b32 %r102, %r440, 0, %p127;
+ bar.sync 0;
+ mov.b32 %r441, %f466;
+ mov.u32 %r442, 31;
+ mov.u32 %r443, 16;
+ mov.u32 %r444, -1;
+ shfl.sync.bfly.b32 %r445|%p128, %r441, %r443, %r442, %r444;
+ mov.b32 %f362, %r445;
add.f32 %f363, %f466, %f362;
- mov.b32 %r447, %f363;
- mov.u32 %r448, 8;
- shfl.sync.bfly.b32 %r449|%p129, %r447, %r448, %r443, %r445;
- mov.b32 %f364, %r449;
+ mov.b32 %r446, %f363;
+ mov.u32 %r447, 8;
+ shfl.sync.bfly.b32 %r448|%p129, %r446, %r447, %r442, %r444;
+ mov.b32 %f364, %r448;
add.f32 %f365, %f363, %f364;
- mov.b32 %r450, %f365;
- mov.u32 %r451, 4;
- shfl.sync.bfly.b32 %r452|%p130, %r450, %r451, %r443, %r445;
- mov.b32 %f366, %r452;
+ mov.b32 %r449, %f365;
+ mov.u32 %r450, 4;
+ shfl.sync.bfly.b32 %r451|%p130, %r449, %r450, %r442, %r444;
+ mov.b32 %f366, %r451;
add.f32 %f367, %f365, %f366;
- mov.b32 %r453, %f367;
- mov.u32 %r454, 2;
- shfl.sync.bfly.b32 %r455|%p131, %r453, %r454, %r443, %r445;
- mov.b32 %f368, %r455;
+ mov.b32 %r452, %f367;
+ mov.u32 %r453, 2;
+ shfl.sync.bfly.b32 %r454|%p131, %r452, %r453, %r442, %r444;
+ mov.b32 %f368, %r454;
add.f32 %f369, %f367, %f368;
- mov.b32 %r456, %f369;
- mov.u32 %r457, 1;
- shfl.sync.bfly.b32 %r458|%p132, %r456, %r457, %r443, %r445;
- mov.b32 %f370, %r458;
+ mov.b32 %r455, %f369;
+ mov.u32 %r456, 1;
+ shfl.sync.bfly.b32 %r457|%p132, %r455, %r456, %r442, %r444;
+ mov.b32 %f370, %r457;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
@@ -1507,62 +1505,62 @@
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
- mov.b32 %r459, %f470;
- mov.u32 %r460, 31;
- mov.u32 %r461, 16;
- mov.u32 %r462, -1;
- shfl.sync.bfly.b32 %r463|%p135, %r459, %r461, %r460, %r462;
- mov.b32 %f372, %r463;
+ mov.b32 %r458, %f470;
+ mov.u32 %r459, 31;
+ mov.u32 %r460, 16;
+ mov.u32 %r461, -1;
+ shfl.sync.bfly.b32 %r462|%p135, %r458, %r460, %r459, %r461;
+ mov.b32 %f372, %r462;
add.f32 %f373, %f470, %f372;
- mov.b32 %r464, %f373;
- mov.u32 %r465, 8;
- shfl.sync.bfly.b32 %r466|%p136, %r464, %r465, %r460, %r462;
- mov.b32 %f374, %r466;
+ mov.b32 %r463, %f373;
+ mov.u32 %r464, 8;
+ shfl.sync.bfly.b32 %r465|%p136, %r463, %r464, %r459, %r461;
+ mov.b32 %f374, %r465;
add.f32 %f375, %f373, %f374;
- mov.b32 %r467, %f375;
- mov.u32 %r468, 4;
- shfl.sync.bfly.b32 %r469|%p137, %r467, %r468, %r460, %r462;
- mov.b32 %f376, %r469;
+ mov.b32 %r466, %f375;
+ mov.u32 %r467, 4;
+ shfl.sync.bfly.b32 %r468|%p137, %r466, %r467, %r459, %r461;
+ mov.b32 %f376, %r468;
add.f32 %f377, %f375, %f376;
- mov.b32 %r470, %f377;
- mov.u32 %r471, 2;
- shfl.sync.bfly.b32 %r472|%p138, %r470, %r471, %r460, %r462;
- mov.b32 %f378, %r472;
+ mov.b32 %r469, %f377;
+ mov.u32 %r470, 2;
+ shfl.sync.bfly.b32 %r471|%p138, %r469, %r470, %r459, %r461;
+ mov.b32 %f378, %r471;
add.f32 %f379, %f377, %f378;
- mov.b32 %r473, %f379;
- mov.u32 %r474, 1;
- shfl.sync.bfly.b32 %r475|%p139, %r473, %r474, %r460, %r462;
- mov.b32 %f380, %r475;
+ mov.b32 %r472, %f379;
+ mov.u32 %r473, 1;
+ shfl.sync.bfly.b32 %r474|%p139, %r472, %r473, %r459, %r461;
+ mov.b32 %f380, %r474;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
- setp.ne.s32 %p140, %r5, 0;
+ setp.ne.s32 %p140, %r6, 0;
@%p140 bra $L__BB0_160;
- mul.lo.s32 %r103, %r78, %r588;
- add.s32 %r476, %r77, %r103;
- setp.ge.s32 %p141, %r476, %r155;
+ mul.lo.s32 %r103, %r78, %r586;
+ add.s32 %r475, %r77, %r103;
+ setp.ge.s32 %p141, %r475, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r479, %r79, %r103;
- mul.wide.s32 %rd129, %r479, 4;
+ add.s32 %r478, %r79, %r103;
+ mul.wide.s32 %rd129, %r478, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
- mov.b32 %r480, %f381;
- selp.b32 %r478, %r480, 0, %p127;
-
- st.global.cs.v2.s32 [%rd128], {%r102,%r478};
+ mov.b32 %r479, %f381;
+ selp.b32 %r477, %r479, 0, %p127;
+
+ st.global.cs.v2.s32 [%rd128], {%r102,%r477};
$L__BB0_160:
- add.s32 %r588, %r588, 1;
- setp.lt.s32 %p143, %r588, %r74;
+ add.s32 %r586, %r586, 1;
+ setp.lt.s32 %p143, %r586, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
11: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_1024
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 47→ 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T48, Tensor<float, 2, 2> T53, Tensor<int64_t, 1, 1> T58) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T34 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T30 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T31 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T48
// Allocate global tensor T53
__syncthreads();
Array<float, 4, 4> T49;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T49[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T54;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T54[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T52;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T47;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T47[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
}
Array<float, 1, 1> T32;
T32[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T32[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T33;
T33[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216)) {
T33[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T33[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T46;
T46[0] = 0.000000000e+00f;
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
T46[0]
= T46[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T57[0]
= T57[0]
+ T13[0];
T47[i9]
= T47[i9]
+ T21[0];
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
Array<float, 1, 1> T9;
T9[0]
= T39[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T46[0]
= T46[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T45[i9]
- T32[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T33[0];
Array<float, 1, 1> T21;
T21[0]
= T40[i9]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T57[0]
= T57[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
T47[i9]
= T47[i9]
+ T21[0];
}
}
}
warp::warpReduceTIDX<false, true>(T11[0], T46[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T14[0], T57[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T42[i11];
Array<float, 1, 1> T25;
T25[0]
= T38[i11]
* T24[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T25[0];
Array<float, 1, 1> T27;
T27[0]
= T44[i11]
- T32[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T33[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T35[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T49[i6], T47[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T54[i7], T52[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T48[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T49[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T53[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T54[0]);
}
}
// Allocate global tensor T58
grid_sync::sync<false, true, false, true, true>(T58[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
Array<float, 2, 1> T56;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T55;
T55.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T55[0], &*(volatile float*)&T53[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T56[i13]
= T56[i13]
+ T55[i13];
}
}
Array<float, 2, 2> T37;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T37[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
warp::warpReduceTIDX<false, true>(T37[i15], T56[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i12))], &T37[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i16) {
Array<float, 2, 1> T51;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i18 = 0; i18 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i18) {
Array<float, 2, 2> T50;
T50.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i18)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T50[0], &*(volatile float*)&T48[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) + ((((nvfuser_index_t)blockDim.x) * i2) * i18))]);
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T51[i17]
= T51[i17]
+ T50[i17];
}
}
Array<float, 2, 2> T36;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T36[i19] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
warp::warpReduceTIDX<false, true>(T36[i19], T51[i19], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16)) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i16))], &T36[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T30) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
+ T41[i8];
}
} else {
Array<float, 4, 4> T41;
T41.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T41[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T52[i8]
= T52[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T31) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -149,20 +149,20 @@
Array<float, 1, 1> T57;
T57[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T40;
T40.set(float(0));
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T43;
T43.set(float(0));
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T39;
T39.set(float(0));
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
= T43[i9];
@@ -198,26 +198,26 @@
}
} else {
Array<float, 4, 4> T40;
T40.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T40[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T43;
T43.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T43[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T39;
T39.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T39[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T8;
T8[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T42;
T42.set(float(0));
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T38;
T38.set(float(0));
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T35[0]);
} else {
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T31[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T42;
T42.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T42[0], &T34[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T38;
T38.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 216))) {
- loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T38[0], &T30[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T35 = T38;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
.reg .b32 %r<597>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r164, %r165}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r168, %r169}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
shr.s32 %r194, %r193, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r195, %r194, %r2;
add.s32 %r196, %r195, 31;
shr.s32 %r197, %r196, 31;
shr.u32 %r198, %r197, 27;
add.s32 %r199, %r196, %r198;
shr.u32 %r200, %r199, 5;
mov.u32 %r3, %ntid.y;
mul.lo.s32 %r201, %r3, %r200;
shl.b32 %r202, %r201, 7;
cvt.u64.u32 %rd1, %r202;
mul.lo.s32 %r203, %r3, %r194;
shl.b32 %r204, %r203, 4;
or.b32 %r205, %r204, 15;
and.b32 %r4, %r205, -16;
add.s32 %r206, %r205, %r4;
and.b32 %r207, %r206, -16;
cvt.s64.s32 %rd2, %r207;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p6, %r5, %r194;
shl.b32 %r6, %r5, 2;
or.b32 %r208, %r6, 3;
setp.lt.s32 %p7, %r208, %r155;
and.pred %p1, %p7, %p6;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p8, %r7, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r209, smem_ptr; }
// end inline asm
shl.b32 %r212, %r5, 4;
add.s32 %r210, %r209, %r212;
mul.wide.s32 %rd48, %r6, 4;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r211, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r211, 0;
cp.async.ca.shared.global [%r210], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r213, %r3, 215;
div.s32 %r214, %r213, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r215, %r8, %r214;
add.s32 %r216, %r215, -1;
div.s32 %r9, %r216, %r8;
setp.gt.s32 %p10, %r9, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
cvt.s64.s32 %rd49, %r4;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r218, %ctaid.y;
mul.lo.s32 %r219, %r9, %r3;
mul.lo.s32 %r10, %r219, %r218;
shl.b32 %r220, %r7, 2;
shl.b32 %r221, %r5, 4;
mad.lo.s32 %r11, %r220, %r155, %r221;
mul.lo.s32 %r222, %r155, %r7;
cvt.s64.s32 %rd53, %r222;
cvt.s64.s32 %rd54, %r6;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r223, %r10, %r155;
cvt.s64.s32 %rd6, %r223;
mul.lo.s32 %r12, %r155, %r3;
mul.lo.s32 %r13, %r9, %r218;
add.s32 %r14, %r222, %r6;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r14, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
mad.lo.s32 %r225, %r224, %r3, %r7;
shr.u32 %r15, %r2, 5;
mul.lo.s32 %r226, %r225, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r5, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r6, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
mov.u32 %r565, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
// end inline asm
add.s32 %r232, %r11, %r231;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
add.s32 %r258, %r11, %r257;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r229, %r565, %r3, %r7;
add.s32 %r230, %r229, %r10;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r234, %r12, %r565;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r233, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r233, 0;
cp.async.ca.shared.global [%r232], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r235, %r13, %r565;
mad.lo.s32 %r236, %r235, %r3, %r7;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r245, %r13, %r565;
mad.lo.s32 %r246, %r245, %r3, %r7;
setp.gt.s32 %p15, %r246, 215;
mov.u32 %r566, 0;
mov.u32 %r567, %r566;
mov.u32 %r568, %r566;
mov.u32 %r569, %r566;
@%p15 bra $L__BB0_15;
ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r566, 0;
mov.u32 %r567, %r566;
mov.u32 %r568, %r566;
mov.u32 %r569, %r566;
$L__BB0_15:
add.s32 %r255, %r13, %r565;
mad.lo.s32 %r33, %r255, %r3, %r7;
mov.b32 %f125, %r569;
add.f32 %f455, %f455, %f125;
mov.b32 %f126, %r568;
add.f32 %f454, %f454, %f126;
mov.b32 %f127, %r567;
add.f32 %f453, %f453, %f127;
mov.b32 %f128, %r566;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
mul.lo.s32 %r256, %r33, %r164;
mul.wide.s32 %rd69, %r256, 4;
add.s64 %rd70, %rd15, %rd69;
ld.global.f32 %f436, [%rd70];
$L__BB0_17:
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
mul.lo.s32 %r260, %r12, %r565;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
mov.f32 %f442, 0f00000000;
mov.f32 %f437, %f442;
@%p16 bra $L__BB0_21;
mul.lo.s32 %r261, %r33, %r168;
mul.wide.s32 %rd77, %r261, 4;
add.s64 %rd78, %rd16, %rd77;
ld.global.f32 %f437, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f443, %f442;
@%p18 bra $L__BB0_23;
ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%rd11];
ld.shared.v4.f32 {%f137, %f138, %f139, %f140}, [%rd7];
mul.f32 %f142, %f132, %f137;
add.f32 %f143, %f142, 0f00000000;
ld.shared.v4.f32 {%f144, %f145, %f146, %f147}, [%rd9];
sub.f32 %f149, %f144, %f436;
mul.f32 %f150, %f437, %f149;
fma.rn.f32 %f151, %f142, %f150, 0f00000000;
fma.rn.f32 %f438, %f150, %f137, %f438;
mul.f32 %f154, %f133, %f138;
add.f32 %f155, %f143, %f154;
sub.f32 %f157, %f145, %f436;
mul.f32 %f158, %f437, %f157;
fma.rn.f32 %f159, %f154, %f158, %f151;
fma.rn.f32 %f439, %f158, %f138, %f439;
mul.f32 %f162, %f134, %f139;
add.f32 %f163, %f155, %f162;
sub.f32 %f165, %f146, %f436;
mul.f32 %f166, %f437, %f165;
fma.rn.f32 %f167, %f162, %f166, %f159;
fma.rn.f32 %f440, %f166, %f139, %f440;
mul.f32 %f170, %f135, %f140;
add.f32 %f443, %f163, %f170;
sub.f32 %f172, %f147, %f436;
mul.f32 %f173, %f437, %f172;
fma.rn.f32 %f442, %f170, %f173, %f167;
fma.rn.f32 %f441, %f173, %f140, %f441;
$L__BB0_23:
mov.b32 %r262, %f443;
mov.u32 %r263, 31;
mov.u32 %r264, 16;
mov.u32 %r265, -1;
shfl.sync.bfly.b32 %r266|%p21, %r262, %r264, %r263, %r265;
mov.b32 %f174, %r266;
add.f32 %f175, %f443, %f174;
mov.b32 %r267, %f175;
mov.u32 %r268, 8;
shfl.sync.bfly.b32 %r269|%p22, %r267, %r268, %r263, %r265;
mov.b32 %f176, %r269;
add.f32 %f177, %f175, %f176;
mov.b32 %r270, %f177;
mov.u32 %r271, 4;
shfl.sync.bfly.b32 %r272|%p23, %r270, %r271, %r263, %r265;
mov.b32 %f178, %r272;
add.f32 %f179, %f177, %f178;
mov.b32 %r273, %f179;
mov.u32 %r274, 2;
shfl.sync.bfly.b32 %r275|%p24, %r273, %r274, %r263, %r265;
mov.b32 %f180, %r275;
add.f32 %f181, %f179, %f180;
mov.b32 %r276, %f181;
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r278|%p25, %r276, %r277, %r263, %r265;
mov.b32 %f182, %r278;
add.f32 %f445, %f181, %f182;
bar.sync 0;
setp.ne.s32 %p26, %r17, 0;
@%p26 bra $L__BB0_25;
st.shared.f32 [%rd8], %f445;
$L__BB0_25:
setp.ne.s32 %p27, %r16, 0;
bar.sync 0;
@%p27 bra $L__BB0_29;
setp.ge.u32 %p28, %r17, %r15;
mov.f32 %f444, 0f00000000;
@%p28 bra $L__BB0_28;
ld.shared.f32 %f444, [%rd10];
$L__BB0_28:
mov.b32 %r279, %f444;
mov.u32 %r280, 31;
mov.u32 %r281, 16;
mov.u32 %r282, -1;
shfl.sync.bfly.b32 %r283|%p29, %r279, %r281, %r280, %r282;
mov.b32 %f184, %r283;
add.f32 %f185, %f444, %f184;
mov.b32 %r284, %f185;
mov.u32 %r285, 8;
shfl.sync.bfly.b32 %r286|%p30, %r284, %r285, %r280, %r282;
mov.b32 %f186, %r286;
add.f32 %f187, %f185, %f186;
mov.b32 %r287, %f187;
mov.u32 %r288, 4;
shfl.sync.bfly.b32 %r289|%p31, %r287, %r288, %r280, %r282;
mov.b32 %f188, %r289;
add.f32 %f189, %f187, %f188;
mov.b32 %r290, %f189;
mov.u32 %r291, 2;
shfl.sync.bfly.b32 %r292|%p32, %r290, %r291, %r280, %r282;
mov.b32 %f190, %r292;
add.f32 %f191, %f189, %f190;
mov.b32 %r293, %f191;
mov.u32 %r294, 1;
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
setp.ne.s32 %p180, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
shfl.sync.bfly.b32 %r300|%p34, %r296, %r298, %r297, %r299;
mov.b32 %f193, %r300;
add.f32 %f194, %f442, %f193;
mov.b32 %r301, %f194;
mov.u32 %r302, 8;
shfl.sync.bfly.b32 %r303|%p35, %r301, %r302, %r297, %r299;
mov.b32 %f195, %r303;
add.f32 %f196, %f194, %f195;
mov.b32 %r304, %f196;
mov.u32 %r305, 4;
shfl.sync.bfly.b32 %r306|%p36, %r304, %r305, %r297, %r299;
mov.b32 %f197, %r306;
add.f32 %f198, %f196, %f197;
mov.b32 %r307, %f198;
mov.u32 %r308, 2;
shfl.sync.bfly.b32 %r309|%p37, %r307, %r308, %r297, %r299;
mov.b32 %f199, %r309;
add.f32 %f200, %f198, %f199;
mov.b32 %r310, %f200;
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
@%p180 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
setp.ne.s32 %p181, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
@%p181 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f446, [%rd10];
$L__BB0_34:
mov.b32 %r313, %f446;
mov.u32 %r314, 31;
mov.u32 %r315, 16;
mov.u32 %r316, -1;
shfl.sync.bfly.b32 %r317|%p42, %r313, %r315, %r314, %r316;
mov.b32 %f204, %r317;
add.f32 %f205, %f446, %f204;
mov.b32 %r318, %f205;
mov.u32 %r319, 8;
shfl.sync.bfly.b32 %r320|%p43, %r318, %r319, %r314, %r316;
mov.b32 %f206, %r320;
add.f32 %f207, %f205, %f206;
mov.b32 %r321, %f207;
mov.u32 %r322, 4;
shfl.sync.bfly.b32 %r323|%p44, %r321, %r322, %r314, %r316;
mov.b32 %f208, %r323;
add.f32 %f209, %f207, %f208;
mov.b32 %r324, %f209;
mov.u32 %r325, 2;
shfl.sync.bfly.b32 %r326|%p45, %r324, %r325, %r314, %r316;
mov.b32 %f210, %r326;
add.f32 %f211, %f209, %f210;
mov.b32 %r327, %f211;
mov.u32 %r328, 1;
shfl.sync.bfly.b32 %r329|%p46, %r327, %r328, %r314, %r316;
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
setp.ne.s32 %p47, %r5, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
bar.sync 0;
ld.shared.f32 %f42, [%rd12];
bar.sync 0;
@%p47 bra $L__BB0_39;
setp.eq.s32 %p182, %r17, 0;
add.f32 %f213, %f447, 0f00000000;
selp.f32 %f214, %f213, 0f00000000, %p182;
st.shared.f32 [%rd12], %f214;
$L__BB0_39:
bar.sync 0;
ld.shared.f32 %f43, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_41;
mul.f32 %f215, %f437, %f1;
ld.shared.v4.f32 {%f216, %f217, %f218, %f219}, [%rd11];
ld.shared.v4.f32 {%f221, %f222, %f223, %f224}, [%rd7];
mul.f32 %f226, %f216, %f221;
mul.f32 %f227, %f226, %f2;
ld.shared.v4.f32 {%f228, %f229, %f230, %f231}, [%rd9];
sub.f32 %f233, %f228, %f436;
mul.f32 %f234, %f437, %f233;
sub.f32 %f235, %f227, %f42;
mul.f32 %f236, %f43, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f215, %f237;
mov.b32 %r330, %f238;
mul.f32 %f241, %f217, %f222;
mul.f32 %f242, %f241, %f2;
sub.f32 %f244, %f229, %f436;
mul.f32 %f245, %f437, %f244;
sub.f32 %f246, %f242, %f42;
mul.f32 %f247, %f43, %f245;
sub.f32 %f248, %f246, %f247;
mul.f32 %f249, %f215, %f248;
mov.b32 %r331, %f249;
mul.f32 %f252, %f218, %f223;
mul.f32 %f253, %f252, %f2;
sub.f32 %f255, %f230, %f436;
mul.f32 %f256, %f437, %f255;
sub.f32 %f257, %f253, %f42;
mul.f32 %f258, %f43, %f256;
sub.f32 %f259, %f257, %f258;
mul.f32 %f260, %f215, %f259;
mov.b32 %r332, %f260;
mul.f32 %f263, %f219, %f224;
mul.f32 %f264, %f263, %f2;
sub.f32 %f266, %f231, %f436;
mul.f32 %f267, %f437, %f266;
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
mad.lo.s32 %r334, %r565, %r3, %r10;
mad.lo.s32 %r335, %r334, %r155, %r14;
mul.wide.s32 %rd80, %r335, 4;
add.s64 %rd79, %rd38, %rd80;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_41:
add.s32 %r565, %r565, 1;
setp.lt.s32 %p51, %r565, %r9;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
mov.u32 %r336, %tid.z;
mad.lo.s32 %r35, %r336, %r3, %r7;
mad.lo.s32 %r36, %r35, %r2, %r5;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
clz.b32 %r337, %r3;
mov.u32 %r338, 31;
sub.s32 %r339, %r338, %r337;
mov.u32 %r340, 1;
shl.b32 %r37, %r340, %r339;
setp.lt.u32 %p52, %r7, %r37;
add.s32 %r341, %r37, %r7;
setp.lt.u32 %p53, %r341, %r3;
and.pred %p5, %p52, %p53;
shl.b32 %r342, %r2, %r339;
add.s32 %r343, %r36, %r342;
mul.wide.s32 %rd83, %r343, 4;
add.s64 %rd23, %rd44, %rd83;
shr.u32 %r344, %r37, 31;
add.s32 %r345, %r37, %r344;
shr.s32 %r584, %r345, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
ld.shared.f32 %f272, [%rd23];
ld.shared.f32 %f273, [%rd22];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd22], %f274;
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
mov.u32 %r570, %r584;
$L__BB0_46:
setp.ge.u32 %p56, %r7, %r570;
@%p56 bra $L__BB0_48;
mad.lo.s32 %r346, %r570, %r2, %r36;
mul.wide.s32 %rd84, %r346, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
shr.u32 %r40, %r570, 1;
setp.gt.u32 %p57, %r570, 3;
mov.u32 %r570, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
add.s32 %r348, %r36, %r2;
mul.wide.u32 %rd87, %r348, 4;
add.s64 %rd24, %rd44, %rd87;
setp.ne.s32 %p58, %r7, 0;
mov.u32 %r571, 0;
@%p58 bra $L__BB0_53;
setp.lt.u32 %p59, %r3, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
mov.b32 %r571, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@%p54 bra $L__BB0_55;
ld.shared.f32 %f280, [%rd23];
ld.shared.f32 %f281, [%rd22];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd22], %f282;
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
mov.u32 %r572, %r584;
$L__BB0_57:
setp.ge.u32 %p62, %r7, %r572;
@%p62 bra $L__BB0_59;
mad.lo.s32 %r349, %r572, %r2, %r36;
mul.wide.s32 %rd89, %r349, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
shr.u32 %r44, %r572, 1;
setp.gt.u32 %p63, %r572, 3;
mov.u32 %r572, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
mov.u32 %r573, 0;
@%p58 bra $L__BB0_64;
setp.lt.u32 %p65, %r3, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
mov.b32 %r573, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@%p54 bra $L__BB0_66;
ld.shared.f32 %f288, [%rd23];
ld.shared.f32 %f289, [%rd22];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd22], %f290;
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
mov.u32 %r574, %r584;
$L__BB0_68:
setp.ge.u32 %p68, %r7, %r574;
@%p68 bra $L__BB0_70;
mad.lo.s32 %r351, %r574, %r2, %r36;
mul.wide.s32 %rd92, %r351, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
shr.u32 %r48, %r574, 1;
setp.gt.u32 %p69, %r574, 3;
mov.u32 %r574, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
mov.u32 %r575, 0;
@%p58 bra $L__BB0_75;
setp.lt.u32 %p71, %r3, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
mov.b32 %r575, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@%p54 bra $L__BB0_77;
ld.shared.f32 %f296, [%rd23];
ld.shared.f32 %f297, [%rd22];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd22], %f298;
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
mov.u32 %r576, %r584;
$L__BB0_79:
setp.ge.u32 %p74, %r7, %r576;
@%p74 bra $L__BB0_81;
mad.lo.s32 %r353, %r576, %r2, %r36;
mul.wide.s32 %rd95, %r353, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
shr.u32 %r52, %r576, 1;
setp.gt.u32 %p75, %r576, 3;
mov.u32 %r576, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
mov.u32 %r577, 0;
@%p58 bra $L__BB0_86;
setp.lt.u32 %p77, %r3, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
mov.b32 %r577, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@%p54 bra $L__BB0_88;
ld.shared.f32 %f304, [%rd23];
ld.shared.f32 %f305, [%rd22];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd22], %f306;
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
mov.u32 %r578, %r584;
$L__BB0_90:
setp.ge.u32 %p80, %r7, %r578;
@%p80 bra $L__BB0_92;
mad.lo.s32 %r355, %r578, %r2, %r36;
mul.wide.s32 %rd98, %r355, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
shr.u32 %r56, %r578, 1;
setp.gt.u32 %p81, %r578, 3;
mov.u32 %r578, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
mov.u32 %r579, 0;
@%p58 bra $L__BB0_97;
setp.lt.u32 %p83, %r3, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
mov.b32 %r579, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@%p54 bra $L__BB0_99;
ld.shared.f32 %f312, [%rd23];
ld.shared.f32 %f313, [%rd22];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd22], %f314;
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
mov.u32 %r580, %r584;
$L__BB0_101:
setp.ge.u32 %p86, %r7, %r580;
@%p86 bra $L__BB0_103;
mad.lo.s32 %r357, %r580, %r2, %r36;
mul.wide.s32 %rd101, %r357, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
shr.u32 %r60, %r580, 1;
setp.gt.u32 %p87, %r580, 3;
mov.u32 %r580, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
mov.u32 %r581, 0;
@%p58 bra $L__BB0_108;
setp.lt.u32 %p89, %r3, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
mov.b32 %r581, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@%p54 bra $L__BB0_110;
ld.shared.f32 %f320, [%rd23];
ld.shared.f32 %f321, [%rd22];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd22], %f322;
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
mov.u32 %r582, %r584;
$L__BB0_112:
setp.ge.u32 %p92, %r7, %r582;
@%p92 bra $L__BB0_114;
mad.lo.s32 %r359, %r582, %r2, %r36;
mul.wide.s32 %rd104, %r359, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
shr.u32 %r64, %r582, 1;
setp.gt.u32 %p93, %r582, 3;
mov.u32 %r582, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
mov.u32 %r583, 0;
@%p58 bra $L__BB0_119;
setp.lt.u32 %p95, %r3, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
mov.b32 %r583, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@%p54 bra $L__BB0_121;
ld.shared.f32 %f328, [%rd23];
ld.shared.f32 %f329, [%rd22];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd22], %f330;
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
setp.ge.u32 %p98, %r7, %r584;
@%p98 bra $L__BB0_124;
mad.lo.s32 %r361, %r584, %r2, %r36;
mul.wide.s32 %rd107, %r361, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
shr.u32 %r68, %r584, 1;
setp.gt.u32 %p99, %r584, 3;
mov.u32 %r584, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r585, 0;
@%p58 bra $L__BB0_129;
setp.lt.u32 %p101, %r3, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
mov.b32 %r585, %f463;
$L__BB0_129:
setp.eq.s32 %p179, %r7, 0;
and.pred %p178, %p179, %p1;
bar.sync 0;
@%p178 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
shl.b32 %r564, %r5, 2;
mov.u32 %r371, %ctaid.y;
mad.lo.s32 %r372, %r155, %r371, %r564;
mul.wide.s32 %rd112, %r372, 4;
add.s64 %rd110, %rd41, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd110], {%r571,%r573,%r575,%r577};
// end inline asm
add.s64 %rd111, %rd42, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd111], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r373, %r5, %r7;
or.b32 %r375, %r373, %r336;
setp.ne.s32 %p102, %r375, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
mov.u32 %r376, %ctaid.x;
mov.u32 %r377, %ctaid.z;
mov.u32 %r378, %nctaid.x;
mad.lo.s32 %r379, %r377, %r378, %r376;
mul.wide.s32 %rd114, %r379, 8;
add.s64 %rd27, %rd113, %rd114;
add.s32 %r380, %r8, -1;
setp.eq.s32 %p103, %r71, %r380;
cvt.s64.s32 %rd115, %r8;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
mov.u32 %r586, 8;
$L__BB0_134:
// begin inline asm
nanosleep.u32 %r586;
// end inline asm
setp.lt.u32 %p105, %r586, 256;
selp.u32 %r383, 1, 0, %p105;
shl.b32 %r586, %r586, %r383;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
add.s32 %r385, %r155, 1;
shr.u32 %r386, %r385, 31;
add.s32 %r387, %r385, %r386;
shr.s32 %r388, %r387, 1;
add.s32 %r389, %r3, %r388;
add.s32 %r390, %r389, -1;
div.s32 %r391, %r390, %r3;
add.s32 %r392, %r8, -1;
add.s32 %r393, %r392, %r391;
div.s32 %r74, %r393, %r8;
add.s32 %r75, %r392, %r2;
shl.b32 %r76, %r7, 1;
shl.b32 %r394, %r3, 1;
mad.lo.s32 %r79, %r394, %r71, %r76;
or.b32 %r77, %r79, 1;
mul.lo.s32 %r78, %r394, %r8;
shr.u32 %r80, %r2, 5;
mul.lo.s32 %r395, %r35, %r80;
shr.u32 %r81, %r5, 5;
add.s32 %r396, %r395, %r81;
mul.wide.u32 %rd123, %r396, 4;
add.s64 %rd29, %rd44, %rd123;
and.b32 %r82, %r5, 31;
add.s32 %r397, %r395, %r82;
mul.wide.u32 %rd125, %r397, 4;
add.s64 %rd30, %rd44, %rd125;
mov.u32 %r587, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
add.s32 %r587, %r587, 1;
$L__BB0_136:
.pragma "nounroll";
setp.lt.s32 %p107, %r587, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
div.s32 %r105, %r75, %r2;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
mul.lo.s32 %r482, %r78, %r587;
add.s32 %r106, %r77, %r482;
add.s32 %r107, %r79, %r482;
mov.u32 %r481, 0;
mov.f32 %f474, 0f00000000;
mov.u32 %r594, %r481;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
mov.u32 %r595, %r481;
mov.u32 %r596, %r481;
@%p145 bra $L__BB0_167;
mad.lo.s32 %r109, %r594, %r2, %r5;
setp.ge.s32 %p146, %r109, %r8;
mov.u32 %r595, %r481;
mov.u32 %r596, %r481;
@%p146 bra $L__BB0_167;
mad.lo.s32 %r489, %r109, %r155, %r107;
mul.wide.s32 %rd131, %r489, 4;
add.s64 %rd130, %rd42, %rd131;
// begin inline asm
ld.volatile.global.v2.s32 {%r596,%r595}, [%rd130];
// end inline asm
$L__BB0_167:
mov.b32 %f386, %r596;
add.f32 %f475, %f475, %f386;
mov.b32 %f387, %r595;
add.f32 %f474, %f474, %f387;
add.s32 %r594, %r594, 1;
setp.lt.s32 %p147, %r594, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
mov.b32 %r490, %f475;
mov.u32 %r491, 31;
mov.u32 %r492, 16;
mov.u32 %r493, -1;
shfl.sync.bfly.b32 %r494|%p148, %r490, %r492, %r491, %r493;
mov.b32 %f388, %r494;
add.f32 %f389, %f475, %f388;
mov.b32 %r495, %f389;
mov.u32 %r496, 8;
shfl.sync.bfly.b32 %r497|%p149, %r495, %r496, %r491, %r493;
mov.b32 %f390, %r497;
add.f32 %f391, %f389, %f390;
mov.b32 %r498, %f391;
mov.u32 %r499, 4;
shfl.sync.bfly.b32 %r500|%p150, %r498, %r499, %r491, %r493;
mov.b32 %f392, %r500;
add.f32 %f393, %f391, %f392;
mov.b32 %r501, %f393;
mov.u32 %r502, 2;
shfl.sync.bfly.b32 %r503|%p151, %r501, %r502, %r491, %r493;
mov.b32 %f394, %r503;
add.f32 %f395, %f393, %f394;
mov.b32 %r504, %f395;
mov.u32 %r505, 1;
shfl.sync.bfly.b32 %r506|%p152, %r504, %r505, %r491, %r493;
mov.b32 %f396, %r506;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
st.shared.f32 [%rd29], %f477;
$L__BB0_170:
setp.ne.s32 %p154, %r81, 0;
bar.sync 0;
@%p154 bra $L__BB0_174;
setp.ge.u32 %p155, %r82, %r80;
mov.f32 %f476, 0f00000000;
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
mov.b32 %r507, %f476;
mov.u32 %r508, 31;
mov.u32 %r509, 16;
mov.u32 %r510, -1;
shfl.sync.bfly.b32 %r511|%p156, %r507, %r509, %r508, %r510;
mov.b32 %f398, %r511;
add.f32 %f399, %f476, %f398;
mov.b32 %r512, %f399;
mov.u32 %r513, 8;
shfl.sync.bfly.b32 %r514|%p157, %r512, %r513, %r508, %r510;
mov.b32 %f400, %r514;
add.f32 %f401, %f399, %f400;
mov.b32 %r515, %f401;
mov.u32 %r516, 4;
shfl.sync.bfly.b32 %r517|%p158, %r515, %r516, %r508, %r510;
mov.b32 %f402, %r517;
add.f32 %f403, %f401, %f402;
mov.b32 %r518, %f403;
mov.u32 %r519, 2;
shfl.sync.bfly.b32 %r520|%p159, %r518, %r519, %r508, %r510;
mov.b32 %f404, %r520;
add.f32 %f405, %f403, %f404;
mov.b32 %r521, %f405;
mov.u32 %r522, 1;
shfl.sync.bfly.b32 %r523|%p160, %r521, %r522, %r508, %r510;
mov.b32 %f406, %r523;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
mov.b32 %r524, %f407;
setp.eq.s32 %p162, %r82, 0;
selp.b32 %r115, %r524, 0, %p162;
bar.sync 0;
mov.b32 %r525, %f474;
mov.u32 %r526, 31;
mov.u32 %r527, 16;
mov.u32 %r528, -1;
shfl.sync.bfly.b32 %r529|%p163, %r525, %r527, %r526, %r528;
mov.b32 %f408, %r529;
add.f32 %f409, %f474, %f408;
mov.b32 %r530, %f409;
mov.u32 %r531, 8;
shfl.sync.bfly.b32 %r532|%p164, %r530, %r531, %r526, %r528;
mov.b32 %f410, %r532;
add.f32 %f411, %f409, %f410;
mov.b32 %r533, %f411;
mov.u32 %r534, 4;
shfl.sync.bfly.b32 %r535|%p165, %r533, %r534, %r526, %r528;
mov.b32 %f412, %r535;
add.f32 %f413, %f411, %f412;
mov.b32 %r536, %f413;
mov.u32 %r537, 2;
shfl.sync.bfly.b32 %r538|%p166, %r536, %r537, %r526, %r528;
mov.b32 %f414, %r538;
add.f32 %f415, %f413, %f414;
mov.b32 %r539, %f415;
mov.u32 %r540, 1;
shfl.sync.bfly.b32 %r541|%p167, %r539, %r540, %r526, %r528;
mov.b32 %f416, %r541;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
$L__BB0_176:
bar.sync 0;
@%p154 bra $L__BB0_180;
setp.ge.u32 %p169, %r82, %r80;
mov.f32 %f478, 0f00000000;
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
mov.b32 %r542, %f478;
mov.u32 %r543, 31;
mov.u32 %r544, 16;
mov.u32 %r545, -1;
shfl.sync.bfly.b32 %r546|%p170, %r542, %r544, %r543, %r545;
mov.b32 %f418, %r546;
add.f32 %f419, %f478, %f418;
mov.b32 %r547, %f419;
mov.u32 %r548, 8;
shfl.sync.bfly.b32 %r549|%p171, %r547, %r548, %r543, %r545;
mov.b32 %f420, %r549;
add.f32 %f421, %f419, %f420;
mov.b32 %r550, %f421;
mov.u32 %r551, 4;
shfl.sync.bfly.b32 %r552|%p172, %r550, %r551, %r543, %r545;
mov.b32 %f422, %r552;
add.f32 %f423, %f421, %f422;
mov.b32 %r553, %f423;
mov.u32 %r554, 2;
shfl.sync.bfly.b32 %r555|%p173, %r553, %r554, %r543, %r545;
mov.b32 %f424, %r555;
add.f32 %f425, %f423, %f424;
mov.b32 %r556, %f425;
mov.u32 %r557, 1;
shfl.sync.bfly.b32 %r558|%p174, %r556, %r557, %r543, %r545;
mov.b32 %f426, %r558;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
setp.ne.s32 %p175, %r5, 0;
@%p175 bra $L__BB0_183;
mul.lo.s32 %r116, %r78, %r587;
add.s32 %r559, %r77, %r116;
setp.ge.s32 %p176, %r559, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r562, %r79, %r116;
mul.wide.s32 %rd133, %r562, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
mov.b32 %r563, %f427;
selp.b32 %r561, %r563, 0, %p162;
// begin inline asm
st.global.cs.v2.s32 [%rd132], {%r115,%r561};
// end inline asm
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
div.s32 %r84, %r75, %r2;
mad.lo.s32 %r85, %r155, %r5, %r76;
shl.b32 %r86, %r71, 1;
shl.b32 %r87, %r8, 1;
mul.lo.s32 %r88, %r155, %r2;
mov.u32 %r588, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
mad.lo.s32 %r90, %r78, %r588, %r77;
mad.lo.s32 %r400, %r87, %r588, %r86;
mad.lo.s32 %r590, %r3, %r400, %r85;
mov.u32 %r399, 0;
mov.f32 %f466, 0f00000000;
mov.u32 %r589, %r5;
mov.u32 %r591, %r399;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
mov.u32 %r592, %r399;
mov.u32 %r593, %r399;
@%p110 bra $L__BB0_144;
setp.ge.s32 %p111, %r589, %r8;
mov.u32 %r592, %r399;
mov.u32 %r593, %r399;
@%p111 bra $L__BB0_144;
mul.wide.s32 %rd127, %r590, 4;
add.s64 %rd126, %rd41, %rd127;
// begin inline asm
ld.volatile.global.v2.s32 {%r593,%r592}, [%rd126];
// end inline asm
$L__BB0_144:
mov.b32 %f340, %r593;
add.f32 %f467, %f467, %f340;
mov.b32 %f341, %r592;
add.f32 %f466, %f466, %f341;
add.s32 %r590, %r590, %r88;
add.s32 %r589, %r589, %r2;
add.s32 %r591, %r591, 1;
setp.lt.s32 %p112, %r591, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
mov.b32 %r407, %f467;
mov.u32 %r408, 31;
mov.u32 %r409, 16;
mov.u32 %r410, -1;
shfl.sync.bfly.b32 %r411|%p113, %r407, %r409, %r408, %r410;
mov.b32 %f342, %r411;
add.f32 %f343, %f467, %f342;
mov.b32 %r412, %f343;
mov.u32 %r413, 8;
shfl.sync.bfly.b32 %r414|%p114, %r412, %r413, %r408, %r410;
mov.b32 %f344, %r414;
add.f32 %f345, %f343, %f344;
mov.b32 %r415, %f345;
mov.u32 %r416, 4;
shfl.sync.bfly.b32 %r417|%p115, %r415, %r416, %r408, %r410;
mov.b32 %f346, %r417;
add.f32 %f347, %f345, %f346;
mov.b32 %r418, %f347;
mov.u32 %r419, 2;
shfl.sync.bfly.b32 %r420|%p116, %r418, %r419, %r408, %r410;
mov.b32 %f348, %r420;
add.f32 %f349, %f347, %f348;
mov.b32 %r421, %f349;
mov.u32 %r422, 1;
shfl.sync.bfly.b32 %r423|%p117, %r421, %r422, %r408, %r410;
mov.b32 %f350, %r423;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
st.shared.f32 [%rd29], %f469;
$L__BB0_147:
setp.ne.s32 %p119, %r81, 0;
bar.sync 0;
@%p119 bra $L__BB0_151;
setp.ge.u32 %p120, %r82, %r80;
mov.f32 %f468, 0f00000000;
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
mov.b32 %r424, %f468;
mov.u32 %r425, 31;
mov.u32 %r426, 16;
mov.u32 %r427, -1;
shfl.sync.bfly.b32 %r428|%p121, %r424, %r426, %r425, %r427;
mov.b32 %f352, %r428;
add.f32 %f353, %f468, %f352;
mov.b32 %r429, %f353;
mov.u32 %r430, 8;
shfl.sync.bfly.b32 %r431|%p122, %r429, %r430, %r425, %r427;
mov.b32 %f354, %r431;
add.f32 %f355, %f353, %f354;
mov.b32 %r432, %f355;
mov.u32 %r433, 4;
shfl.sync.bfly.b32 %r434|%p123, %r432, %r433, %r425, %r427;
mov.b32 %f356, %r434;
add.f32 %f357, %f355, %f356;
mov.b32 %r435, %f357;
mov.u32 %r436, 2;
shfl.sync.bfly.b32 %r437|%p124, %r435, %r436, %r425, %r427;
mov.b32 %f358, %r437;
add.f32 %f359, %f357, %f358;
mov.b32 %r438, %f359;
mov.u32 %r439, 1;
shfl.sync.bfly.b32 %r440|%p125, %r438, %r439, %r425, %r427;
mov.b32 %f360, %r440;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
mov.b32 %r441, %f361;
setp.eq.s32 %p127, %r82, 0;
selp.b32 %r102, %r441, 0, %p127;
bar.sync 0;
mov.b32 %r442, %f466;
mov.u32 %r443, 31;
mov.u32 %r444, 16;
mov.u32 %r445, -1;
shfl.sync.bfly.b32 %r446|%p128, %r442, %r444, %r443, %r445;
mov.b32 %f362, %r446;
add.f32 %f363, %f466, %f362;
mov.b32 %r447, %f363;
mov.u32 %r448, 8;
shfl.sync.bfly.b32 %r449|%p129, %r447, %r448, %r443, %r445;
mov.b32 %f364, %r449;
add.f32 %f365, %f363, %f364;
mov.b32 %r450, %f365;
mov.u32 %r451, 4;
shfl.sync.bfly.b32 %r452|%p130, %r450, %r451, %r443, %r445;
mov.b32 %f366, %r452;
add.f32 %f367, %f365, %f366;
mov.b32 %r453, %f367;
mov.u32 %r454, 2;
shfl.sync.bfly.b32 %r455|%p131, %r453, %r454, %r443, %r445;
mov.b32 %f368, %r455;
add.f32 %f369, %f367, %f368;
mov.b32 %r456, %f369;
mov.u32 %r457, 1;
shfl.sync.bfly.b32 %r458|%p132, %r456, %r457, %r443, %r445;
mov.b32 %f370, %r458;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
$L__BB0_153:
bar.sync 0;
@%p119 bra $L__BB0_157;
setp.ge.u32 %p134, %r82, %r80;
mov.f32 %f470, 0f00000000;
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
mov.b32 %r459, %f470;
mov.u32 %r460, 31;
mov.u32 %r461, 16;
mov.u32 %r462, -1;
shfl.sync.bfly.b32 %r463|%p135, %r459, %r461, %r460, %r462;
mov.b32 %f372, %r463;
add.f32 %f373, %f470, %f372;
mov.b32 %r464, %f373;
mov.u32 %r465, 8;
shfl.sync.bfly.b32 %r466|%p136, %r464, %r465, %r460, %r462;
mov.b32 %f374, %r466;
add.f32 %f375, %f373, %f374;
mov.b32 %r467, %f375;
mov.u32 %r468, 4;
shfl.sync.bfly.b32 %r469|%p137, %r467, %r468, %r460, %r462;
mov.b32 %f376, %r469;
add.f32 %f377, %f375, %f376;
mov.b32 %r470, %f377;
mov.u32 %r471, 2;
shfl.sync.bfly.b32 %r472|%p138, %r470, %r471, %r460, %r462;
mov.b32 %f378, %r472;
add.f32 %f379, %f377, %f378;
mov.b32 %r473, %f379;
mov.u32 %r474, 1;
shfl.sync.bfly.b32 %r475|%p139, %r473, %r474, %r460, %r462;
mov.b32 %f380, %r475;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
setp.ne.s32 %p140, %r5, 0;
@%p140 bra $L__BB0_160;
mul.lo.s32 %r103, %r78, %r588;
add.s32 %r476, %r77, %r103;
setp.ge.s32 %p141, %r476, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_be9b4d4d_1033910nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r479, %r79, %r103;
mul.wide.s32 %rd129, %r479, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
mov.b32 %r480, %f381;
selp.b32 %r478, %r480, 0, %p127;
// begin inline asm
st.global.cs.v2.s32 [%rd128], {%r102,%r478};
// end inline asm
$L__BB0_160:
add.s32 %r588, %r588, 1;
setp.lt.s32 %p143, %r588, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
.reg .b32 %r<595>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r164, %r165}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r168, %r169}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
shr.s32 %r2, %r193, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r194, %r2, %r3;
add.s32 %r195, %r194, 31;
shr.s32 %r196, %r195, 31;
shr.u32 %r197, %r196, 27;
add.s32 %r198, %r195, %r197;
shr.u32 %r199, %r198, 5;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r200, %r4, %r199;
shl.b32 %r201, %r200, 7;
cvt.u64.u32 %rd1, %r201;
mul.lo.s32 %r202, %r4, %r2;
shl.b32 %r203, %r202, 4;
or.b32 %r204, %r203, 15;
and.b32 %r5, %r204, -16;
add.s32 %r205, %r204, %r5;
and.b32 %r206, %r205, -16;
cvt.s64.s32 %rd2, %r206;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p6, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r207, %r7, 3;
setp.lt.s32 %p7, %r207, %r155;
and.pred %p1, %p7, %p6;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r208, smem_ptr; }
// end inline asm
shl.b32 %r211, %r6, 4;
add.s32 %r209, %r208, %r211;
mul.wide.s32 %rd48, %r7, 4;
add.s64 %rd47, %rd37, %rd48;
mov.u32 %r210, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r210, 0;
cp.async.ca.shared.global [%r209], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r212, %r4, 215;
div.s32 %r213, %r212, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r214, %r9, %r213;
add.s32 %r215, %r214, -1;
div.s32 %r10, %r215, %r9;
setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
mov.u32 %r217, %ctaid.y;
mul.lo.s32 %r218, %r10, %r4;
mul.lo.s32 %r11, %r218, %r217;
mad.lo.s32 %r219, %r2, %r8, %r6;
shl.b32 %r12, %r219, 4;
mul.lo.s32 %r220, %r155, %r8;
cvt.s64.s32 %rd53, %r220;
cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r221, %r11, %r155;
cvt.s64.s32 %rd6, %r221;
mul.lo.s32 %r13, %r155, %r4;
mul.lo.s32 %r14, %r10, %r217;
shl.b32 %r222, %r8, 2;
mad.lo.s32 %r223, %r222, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r223, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
mad.lo.s32 %r225, %r224, %r4, %r8;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r226, %r225, %r15;
shr.u32 %r16, %r6, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
and.b32 %r17, %r6, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
mov.u32 %r563, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
// end inline asm
add.s32 %r232, %r231, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
add.s32 %r258, %r257, %r12;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
mad.lo.s32 %r229, %r563, %r4, %r8;
add.s32 %r230, %r229, %r11;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
mul.lo.s32 %r234, %r13, %r563;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
mov.u32 %r233, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r233, 0;
cp.async.ca.shared.global [%r232], [%rd64], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p11 bra $L__BB0_10;
add.s32 %r235, %r14, %r563;
mad.lo.s32 %r236, %r235, %r4, %r8;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r245, %r14, %r563;
mad.lo.s32 %r246, %r245, %r4, %r8;
setp.gt.s32 %p15, %r246, 215;
mov.u32 %r564, 0;
mov.u32 %r565, %r564;
mov.u32 %r566, %r564;
mov.u32 %r567, %r564;
@%p15 bra $L__BB0_15;
ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r564, 0;
mov.u32 %r565, %r564;
mov.u32 %r566, %r564;
mov.u32 %r567, %r564;
$L__BB0_15:
add.s32 %r255, %r14, %r563;
mad.lo.s32 %r33, %r255, %r4, %r8;
mov.b32 %f125, %r567;
add.f32 %f455, %f455, %f125;
mov.b32 %f126, %r566;
add.f32 %f454, %f454, %f126;
mov.b32 %f127, %r565;
add.f32 %f453, %f453, %f127;
mov.b32 %f128, %r564;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
mul.lo.s32 %r256, %r33, %r164;
mul.wide.s32 %rd69, %r256, 4;
add.s64 %rd70, %rd15, %rd69;
ld.global.f32 %f436, [%rd70];
$L__BB0_17:
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
mul.lo.s32 %r260, %r13, %r563;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd72], 16, p0;
}
// end inline asm
$L__BB0_19:
mov.f32 %f442, 0f00000000;
mov.f32 %f437, %f442;
@%p16 bra $L__BB0_21;
mul.lo.s32 %r261, %r33, %r168;
mul.wide.s32 %rd77, %r261, 4;
add.s64 %rd78, %rd16, %rd77;
ld.global.f32 %f437, [%rd78];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f443, %f442;
@%p18 bra $L__BB0_23;
ld.shared.v4.f32 {%f132, %f133, %f134, %f135}, [%rd11];
ld.shared.v4.f32 {%f137, %f138, %f139, %f140}, [%rd7];
mul.f32 %f142, %f132, %f137;
add.f32 %f143, %f142, 0f00000000;
ld.shared.v4.f32 {%f144, %f145, %f146, %f147}, [%rd9];
sub.f32 %f149, %f144, %f436;
mul.f32 %f150, %f437, %f149;
fma.rn.f32 %f151, %f142, %f150, 0f00000000;
fma.rn.f32 %f438, %f150, %f137, %f438;
mul.f32 %f154, %f133, %f138;
add.f32 %f155, %f143, %f154;
sub.f32 %f157, %f145, %f436;
mul.f32 %f158, %f437, %f157;
fma.rn.f32 %f159, %f154, %f158, %f151;
fma.rn.f32 %f439, %f158, %f138, %f439;
mul.f32 %f162, %f134, %f139;
add.f32 %f163, %f155, %f162;
sub.f32 %f165, %f146, %f436;
mul.f32 %f166, %f437, %f165;
fma.rn.f32 %f167, %f162, %f166, %f159;
fma.rn.f32 %f440, %f166, %f139, %f440;
mul.f32 %f170, %f135, %f140;
add.f32 %f443, %f163, %f170;
sub.f32 %f172, %f147, %f436;
mul.f32 %f173, %f437, %f172;
fma.rn.f32 %f442, %f170, %f173, %f167;
fma.rn.f32 %f441, %f173, %f140, %f441;
$L__BB0_23:
mov.b32 %r262, %f443;
mov.u32 %r263, 31;
mov.u32 %r264, 16;
mov.u32 %r265, -1;
shfl.sync.bfly.b32 %r266|%p21, %r262, %r264, %r263, %r265;
mov.b32 %f174, %r266;
add.f32 %f175, %f443, %f174;
mov.b32 %r267, %f175;
mov.u32 %r268, 8;
shfl.sync.bfly.b32 %r269|%p22, %r267, %r268, %r263, %r265;
mov.b32 %f176, %r269;
add.f32 %f177, %f175, %f176;
mov.b32 %r270, %f177;
mov.u32 %r271, 4;
shfl.sync.bfly.b32 %r272|%p23, %r270, %r271, %r263, %r265;
mov.b32 %f178, %r272;
add.f32 %f179, %f177, %f178;
mov.b32 %r273, %f179;
mov.u32 %r274, 2;
shfl.sync.bfly.b32 %r275|%p24, %r273, %r274, %r263, %r265;
mov.b32 %f180, %r275;
add.f32 %f181, %f179, %f180;
mov.b32 %r276, %f181;
mov.u32 %r277, 1;
shfl.sync.bfly.b32 %r278|%p25, %r276, %r277, %r263, %r265;
mov.b32 %f182, %r278;
add.f32 %f445, %f181, %f182;
bar.sync 0;
setp.ne.s32 %p26, %r17, 0;
@%p26 bra $L__BB0_25;
st.shared.f32 [%rd8], %f445;
$L__BB0_25:
setp.ne.s32 %p27, %r16, 0;
bar.sync 0;
@%p27 bra $L__BB0_29;
setp.ge.u32 %p28, %r17, %r15;
mov.f32 %f444, 0f00000000;
@%p28 bra $L__BB0_28;
ld.shared.f32 %f444, [%rd10];
$L__BB0_28:
mov.b32 %r279, %f444;
mov.u32 %r280, 31;
mov.u32 %r281, 16;
mov.u32 %r282, -1;
shfl.sync.bfly.b32 %r283|%p29, %r279, %r281, %r280, %r282;
mov.b32 %f184, %r283;
add.f32 %f185, %f444, %f184;
mov.b32 %r284, %f185;
mov.u32 %r285, 8;
shfl.sync.bfly.b32 %r286|%p30, %r284, %r285, %r280, %r282;
mov.b32 %f186, %r286;
add.f32 %f187, %f185, %f186;
mov.b32 %r287, %f187;
mov.u32 %r288, 4;
shfl.sync.bfly.b32 %r289|%p31, %r287, %r288, %r280, %r282;
mov.b32 %f188, %r289;
add.f32 %f189, %f187, %f188;
mov.b32 %r290, %f189;
mov.u32 %r291, 2;
shfl.sync.bfly.b32 %r292|%p32, %r290, %r291, %r280, %r282;
mov.b32 %f190, %r292;
add.f32 %f191, %f189, %f190;
mov.b32 %r293, %f191;
mov.u32 %r294, 1;
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
setp.ne.s32 %p181, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
shfl.sync.bfly.b32 %r300|%p34, %r296, %r298, %r297, %r299;
mov.b32 %f193, %r300;
add.f32 %f194, %f442, %f193;
mov.b32 %r301, %f194;
mov.u32 %r302, 8;
shfl.sync.bfly.b32 %r303|%p35, %r301, %r302, %r297, %r299;
mov.b32 %f195, %r303;
add.f32 %f196, %f194, %f195;
mov.b32 %r304, %f196;
mov.u32 %r305, 4;
shfl.sync.bfly.b32 %r306|%p36, %r304, %r305, %r297, %r299;
mov.b32 %f197, %r306;
add.f32 %f198, %f196, %f197;
mov.b32 %r307, %f198;
mov.u32 %r308, 2;
shfl.sync.bfly.b32 %r309|%p37, %r307, %r308, %r297, %r299;
mov.b32 %f199, %r309;
add.f32 %f200, %f198, %f199;
mov.b32 %r310, %f200;
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
@%p181 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
setp.ne.s32 %p178, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
@%p178 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
ld.shared.f32 %f446, [%rd10];
$L__BB0_34:
mov.b32 %r313, %f446;
mov.u32 %r314, 31;
mov.u32 %r315, 16;
mov.u32 %r316, -1;
shfl.sync.bfly.b32 %r317|%p42, %r313, %r315, %r314, %r316;
mov.b32 %f204, %r317;
add.f32 %f205, %f446, %f204;
mov.b32 %r318, %f205;
mov.u32 %r319, 8;
shfl.sync.bfly.b32 %r320|%p43, %r318, %r319, %r314, %r316;
mov.b32 %f206, %r320;
add.f32 %f207, %f205, %f206;
mov.b32 %r321, %f207;
mov.u32 %r322, 4;
shfl.sync.bfly.b32 %r323|%p44, %r321, %r322, %r314, %r316;
mov.b32 %f208, %r323;
add.f32 %f209, %f207, %f208;
mov.b32 %r324, %f209;
mov.u32 %r325, 2;
shfl.sync.bfly.b32 %r326|%p45, %r324, %r325, %r314, %r316;
mov.b32 %f210, %r326;
add.f32 %f211, %f209, %f210;
mov.b32 %r327, %f211;
mov.u32 %r328, 1;
shfl.sync.bfly.b32 %r329|%p46, %r327, %r328, %r314, %r316;
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
setp.ne.s32 %p47, %r6, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
bar.sync 0;
ld.shared.f32 %f42, [%rd12];
bar.sync 0;
@%p47 bra $L__BB0_39;
setp.eq.s32 %p182, %r17, 0;
add.f32 %f213, %f447, 0f00000000;
selp.f32 %f214, %f213, 0f00000000, %p182;
st.shared.f32 [%rd12], %f214;
$L__BB0_39:
bar.sync 0;
ld.shared.f32 %f43, [%rd12];
bar.sync 0;
@%p18 bra $L__BB0_41;
mul.f32 %f215, %f437, %f1;
ld.shared.v4.f32 {%f216, %f217, %f218, %f219}, [%rd11];
ld.shared.v4.f32 {%f221, %f222, %f223, %f224}, [%rd7];
mul.f32 %f226, %f216, %f221;
mul.f32 %f227, %f226, %f2;
ld.shared.v4.f32 {%f228, %f229, %f230, %f231}, [%rd9];
sub.f32 %f233, %f228, %f436;
mul.f32 %f234, %f437, %f233;
sub.f32 %f235, %f227, %f42;
mul.f32 %f236, %f43, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f215, %f237;
mov.b32 %r330, %f238;
mul.f32 %f241, %f217, %f222;
mul.f32 %f242, %f241, %f2;
sub.f32 %f244, %f229, %f436;
mul.f32 %f245, %f437, %f244;
sub.f32 %f246, %f242, %f42;
mul.f32 %f247, %f43, %f245;
sub.f32 %f248, %f246, %f247;
mul.f32 %f249, %f215, %f248;
mov.b32 %r331, %f249;
mul.f32 %f252, %f218, %f223;
mul.f32 %f253, %f252, %f2;
sub.f32 %f255, %f230, %f436;
mul.f32 %f256, %f437, %f255;
sub.f32 %f257, %f253, %f42;
mul.f32 %f258, %f43, %f256;
sub.f32 %f259, %f257, %f258;
mul.f32 %f260, %f215, %f259;
mov.b32 %r332, %f260;
mul.f32 %f263, %f219, %f224;
mul.f32 %f264, %f263, %f2;
sub.f32 %f266, %f231, %f436;
mul.f32 %f267, %f437, %f266;
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
mad.lo.s32 %r334, %r33, %r155, %r7;
mul.wide.s32 %rd80, %r334, 4;
add.s64 %rd79, %rd38, %rd80;
// begin inline asm
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
// end inline asm
$L__BB0_41:
add.s32 %r563, %r563, 1;
setp.lt.s32 %p51, %r563, %r10;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
mov.u32 %r335, %tid.z;
mad.lo.s32 %r35, %r335, %r4, %r8;
mad.lo.s32 %r36, %r35, %r3, %r6;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
clz.b32 %r336, %r4;
mov.u32 %r337, 31;
sub.s32 %r338, %r337, %r336;
mov.u32 %r339, 1;
shl.b32 %r37, %r339, %r338;
setp.lt.u32 %p52, %r8, %r37;
add.s32 %r340, %r37, %r8;
setp.lt.u32 %p53, %r340, %r4;
and.pred %p5, %p52, %p53;
shl.b32 %r341, %r3, %r338;
add.s32 %r342, %r36, %r341;
mul.wide.s32 %rd83, %r342, 4;
add.s64 %rd23, %rd44, %rd83;
shr.u32 %r343, %r37, 31;
add.s32 %r344, %r37, %r343;
shr.s32 %r582, %r344, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
ld.shared.f32 %f272, [%rd23];
ld.shared.f32 %f273, [%rd22];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd22], %f274;
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
mov.u32 %r568, %r582;
$L__BB0_46:
setp.ge.u32 %p56, %r8, %r568;
@%p56 bra $L__BB0_48;
mad.lo.s32 %r345, %r568, %r3, %r36;
mul.wide.s32 %rd84, %r345, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
shr.u32 %r40, %r568, 1;
setp.gt.u32 %p57, %r568, 3;
mov.u32 %r568, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
add.s32 %r347, %r36, %r3;
mul.wide.u32 %rd87, %r347, 4;
add.s64 %rd24, %rd44, %rd87;
setp.ne.s32 %p58, %r8, 0;
mov.u32 %r569, 0;
@%p58 bra $L__BB0_53;
setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
mov.b32 %r569, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@%p54 bra $L__BB0_55;
ld.shared.f32 %f280, [%rd23];
ld.shared.f32 %f281, [%rd22];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd22], %f282;
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
mov.u32 %r570, %r582;
$L__BB0_57:
setp.ge.u32 %p62, %r8, %r570;
@%p62 bra $L__BB0_59;
mad.lo.s32 %r348, %r570, %r3, %r36;
mul.wide.s32 %rd89, %r348, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
shr.u32 %r44, %r570, 1;
setp.gt.u32 %p63, %r570, 3;
mov.u32 %r570, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
mov.u32 %r571, 0;
@%p58 bra $L__BB0_64;
setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
mov.b32 %r571, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@%p54 bra $L__BB0_66;
ld.shared.f32 %f288, [%rd23];
ld.shared.f32 %f289, [%rd22];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd22], %f290;
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
mov.u32 %r572, %r582;
$L__BB0_68:
setp.ge.u32 %p68, %r8, %r572;
@%p68 bra $L__BB0_70;
mad.lo.s32 %r350, %r572, %r3, %r36;
mul.wide.s32 %rd92, %r350, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
shr.u32 %r48, %r572, 1;
setp.gt.u32 %p69, %r572, 3;
mov.u32 %r572, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
mov.u32 %r573, 0;
@%p58 bra $L__BB0_75;
setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
mov.b32 %r573, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@%p54 bra $L__BB0_77;
ld.shared.f32 %f296, [%rd23];
ld.shared.f32 %f297, [%rd22];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd22], %f298;
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
mov.u32 %r574, %r582;
$L__BB0_79:
setp.ge.u32 %p74, %r8, %r574;
@%p74 bra $L__BB0_81;
mad.lo.s32 %r352, %r574, %r3, %r36;
mul.wide.s32 %rd95, %r352, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
shr.u32 %r52, %r574, 1;
setp.gt.u32 %p75, %r574, 3;
mov.u32 %r574, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
mov.u32 %r575, 0;
@%p58 bra $L__BB0_86;
setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
mov.b32 %r575, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@%p54 bra $L__BB0_88;
ld.shared.f32 %f304, [%rd23];
ld.shared.f32 %f305, [%rd22];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd22], %f306;
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
mov.u32 %r576, %r582;
$L__BB0_90:
setp.ge.u32 %p80, %r8, %r576;
@%p80 bra $L__BB0_92;
mad.lo.s32 %r354, %r576, %r3, %r36;
mul.wide.s32 %rd98, %r354, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
shr.u32 %r56, %r576, 1;
setp.gt.u32 %p81, %r576, 3;
mov.u32 %r576, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
mov.u32 %r577, 0;
@%p58 bra $L__BB0_97;
setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
mov.b32 %r577, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@%p54 bra $L__BB0_99;
ld.shared.f32 %f312, [%rd23];
ld.shared.f32 %f313, [%rd22];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd22], %f314;
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
mov.u32 %r578, %r582;
$L__BB0_101:
setp.ge.u32 %p86, %r8, %r578;
@%p86 bra $L__BB0_103;
mad.lo.s32 %r356, %r578, %r3, %r36;
mul.wide.s32 %rd101, %r356, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
shr.u32 %r60, %r578, 1;
setp.gt.u32 %p87, %r578, 3;
mov.u32 %r578, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
mov.u32 %r579, 0;
@%p58 bra $L__BB0_108;
setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
mov.b32 %r579, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@%p54 bra $L__BB0_110;
ld.shared.f32 %f320, [%rd23];
ld.shared.f32 %f321, [%rd22];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd22], %f322;
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
mov.u32 %r580, %r582;
$L__BB0_112:
setp.ge.u32 %p92, %r8, %r580;
@%p92 bra $L__BB0_114;
mad.lo.s32 %r358, %r580, %r3, %r36;
mul.wide.s32 %rd104, %r358, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
shr.u32 %r64, %r580, 1;
setp.gt.u32 %p93, %r580, 3;
mov.u32 %r580, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
mov.u32 %r581, 0;
@%p58 bra $L__BB0_119;
setp.lt.u32 %p95, %r4, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
mov.b32 %r581, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@%p54 bra $L__BB0_121;
ld.shared.f32 %f328, [%rd23];
ld.shared.f32 %f329, [%rd22];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd22], %f330;
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
setp.ge.u32 %p98, %r8, %r582;
@%p98 bra $L__BB0_124;
mad.lo.s32 %r360, %r582, %r3, %r36;
mul.wide.s32 %rd107, %r360, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
shr.u32 %r68, %r582, 1;
setp.gt.u32 %p99, %r582, 3;
mov.u32 %r582, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r583, 0;
@%p58 bra $L__BB0_129;
setp.lt.u32 %p101, %r4, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
mov.b32 %r583, %f463;
$L__BB0_129:
setp.eq.s32 %p180, %r8, 0;
and.pred %p179, %p180, %p1;
bar.sync 0;
@%p179 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
mov.u32 %r370, %ctaid.y;
mad.lo.s32 %r371, %r155, %r370, %r7;
mul.wide.s32 %rd112, %r371, 4;
add.s64 %rd110, %rd41, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd110], {%r569,%r571,%r573,%r575};
// end inline asm
add.s64 %rd111, %rd42, %rd112;
// begin inline asm
st.volatile.global.v4.s32 [%rd111], {%r577,%r579,%r581,%r583};
// end inline asm
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r372, %r6, %r8;
or.b32 %r374, %r372, %r335;
setp.ne.s32 %p102, %r374, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
mov.u32 %r375, %ctaid.x;
mov.u32 %r376, %ctaid.z;
mov.u32 %r377, %nctaid.x;
mad.lo.s32 %r378, %r376, %r377, %r375;
mul.wide.s32 %rd114, %r378, 8;
add.s64 %rd27, %rd113, %rd114;
add.s32 %r379, %r9, -1;
setp.eq.s32 %p103, %r71, %r379;
cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
mov.u32 %r584, 8;
$L__BB0_134:
// begin inline asm
nanosleep.u32 %r584;
// end inline asm
setp.lt.u32 %p105, %r584, 256;
selp.u32 %r382, 1, 0, %p105;
shl.b32 %r584, %r584, %r382;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
add.s32 %r384, %r155, 1;
shr.u32 %r385, %r384, 31;
add.s32 %r386, %r384, %r385;
shr.s32 %r387, %r386, 1;
add.s32 %r388, %r4, %r387;
add.s32 %r389, %r388, -1;
div.s32 %r390, %r389, %r4;
add.s32 %r391, %r9, -1;
add.s32 %r392, %r391, %r390;
div.s32 %r74, %r392, %r9;
add.s32 %r75, %r391, %r3;
shl.b32 %r76, %r8, 1;
shl.b32 %r393, %r4, 1;
mad.lo.s32 %r79, %r393, %r71, %r76;
or.b32 %r77, %r79, 1;
mul.lo.s32 %r78, %r393, %r9;
shr.u32 %r80, %r3, 5;
mul.lo.s32 %r394, %r35, %r80;
shr.u32 %r81, %r6, 5;
add.s32 %r395, %r394, %r81;
mul.wide.u32 %rd123, %r395, 4;
add.s64 %rd29, %rd44, %rd123;
and.b32 %r82, %r6, 31;
add.s32 %r396, %r394, %r82;
mul.wide.u32 %rd125, %r396, 4;
add.s64 %rd30, %rd44, %rd125;
mov.u32 %r585, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
add.s32 %r585, %r585, 1;
$L__BB0_136:
.pragma "nounroll";
setp.lt.s32 %p107, %r585, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
div.s32 %r105, %r75, %r3;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
mul.lo.s32 %r481, %r78, %r585;
add.s32 %r106, %r77, %r481;
add.s32 %r107, %r79, %r481;
mov.u32 %r480, 0;
mov.f32 %f474, 0f00000000;
mov.u32 %r592, %r480;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
mov.u32 %r593, %r480;
mov.u32 %r594, %r480;
@%p145 bra $L__BB0_167;
mad.lo.s32 %r109, %r592, %r3, %r6;
setp.ge.s32 %p146, %r109, %r9;
mov.u32 %r593, %r480;
mov.u32 %r594, %r480;
@%p146 bra $L__BB0_167;
mad.lo.s32 %r488, %r109, %r155, %r107;
mul.wide.s32 %rd131, %r488, 4;
add.s64 %rd130, %rd42, %rd131;
// begin inline asm
ld.volatile.global.v2.s32 {%r594,%r593}, [%rd130];
// end inline asm
$L__BB0_167:
mov.b32 %f386, %r594;
add.f32 %f475, %f475, %f386;
mov.b32 %f387, %r593;
add.f32 %f474, %f474, %f387;
add.s32 %r592, %r592, 1;
setp.lt.s32 %p147, %r592, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
mov.b32 %r489, %f475;
mov.u32 %r490, 31;
mov.u32 %r491, 16;
mov.u32 %r492, -1;
shfl.sync.bfly.b32 %r493|%p148, %r489, %r491, %r490, %r492;
mov.b32 %f388, %r493;
add.f32 %f389, %f475, %f388;
mov.b32 %r494, %f389;
mov.u32 %r495, 8;
shfl.sync.bfly.b32 %r496|%p149, %r494, %r495, %r490, %r492;
mov.b32 %f390, %r496;
add.f32 %f391, %f389, %f390;
mov.b32 %r497, %f391;
mov.u32 %r498, 4;
shfl.sync.bfly.b32 %r499|%p150, %r497, %r498, %r490, %r492;
mov.b32 %f392, %r499;
add.f32 %f393, %f391, %f392;
mov.b32 %r500, %f393;
mov.u32 %r501, 2;
shfl.sync.bfly.b32 %r502|%p151, %r500, %r501, %r490, %r492;
mov.b32 %f394, %r502;
add.f32 %f395, %f393, %f394;
mov.b32 %r503, %f395;
mov.u32 %r504, 1;
shfl.sync.bfly.b32 %r505|%p152, %r503, %r504, %r490, %r492;
mov.b32 %f396, %r505;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
st.shared.f32 [%rd29], %f477;
$L__BB0_170:
setp.ne.s32 %p154, %r81, 0;
bar.sync 0;
@%p154 bra $L__BB0_174;
setp.ge.u32 %p155, %r82, %r80;
mov.f32 %f476, 0f00000000;
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
mov.b32 %r506, %f476;
mov.u32 %r507, 31;
mov.u32 %r508, 16;
mov.u32 %r509, -1;
shfl.sync.bfly.b32 %r510|%p156, %r506, %r508, %r507, %r509;
mov.b32 %f398, %r510;
add.f32 %f399, %f476, %f398;
mov.b32 %r511, %f399;
mov.u32 %r512, 8;
shfl.sync.bfly.b32 %r513|%p157, %r511, %r512, %r507, %r509;
mov.b32 %f400, %r513;
add.f32 %f401, %f399, %f400;
mov.b32 %r514, %f401;
mov.u32 %r515, 4;
shfl.sync.bfly.b32 %r516|%p158, %r514, %r515, %r507, %r509;
mov.b32 %f402, %r516;
add.f32 %f403, %f401, %f402;
mov.b32 %r517, %f403;
mov.u32 %r518, 2;
shfl.sync.bfly.b32 %r519|%p159, %r517, %r518, %r507, %r509;
mov.b32 %f404, %r519;
add.f32 %f405, %f403, %f404;
mov.b32 %r520, %f405;
mov.u32 %r521, 1;
shfl.sync.bfly.b32 %r522|%p160, %r520, %r521, %r507, %r509;
mov.b32 %f406, %r522;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
mov.b32 %r523, %f407;
setp.eq.s32 %p162, %r82, 0;
selp.b32 %r115, %r523, 0, %p162;
bar.sync 0;
mov.b32 %r524, %f474;
mov.u32 %r525, 31;
mov.u32 %r526, 16;
mov.u32 %r527, -1;
shfl.sync.bfly.b32 %r528|%p163, %r524, %r526, %r525, %r527;
mov.b32 %f408, %r528;
add.f32 %f409, %f474, %f408;
mov.b32 %r529, %f409;
mov.u32 %r530, 8;
shfl.sync.bfly.b32 %r531|%p164, %r529, %r530, %r525, %r527;
mov.b32 %f410, %r531;
add.f32 %f411, %f409, %f410;
mov.b32 %r532, %f411;
mov.u32 %r533, 4;
shfl.sync.bfly.b32 %r534|%p165, %r532, %r533, %r525, %r527;
mov.b32 %f412, %r534;
add.f32 %f413, %f411, %f412;
mov.b32 %r535, %f413;
mov.u32 %r536, 2;
shfl.sync.bfly.b32 %r537|%p166, %r535, %r536, %r525, %r527;
mov.b32 %f414, %r537;
add.f32 %f415, %f413, %f414;
mov.b32 %r538, %f415;
mov.u32 %r539, 1;
shfl.sync.bfly.b32 %r540|%p167, %r538, %r539, %r525, %r527;
mov.b32 %f416, %r540;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
$L__BB0_176:
bar.sync 0;
@%p154 bra $L__BB0_180;
setp.ge.u32 %p169, %r82, %r80;
mov.f32 %f478, 0f00000000;
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
mov.b32 %r541, %f478;
mov.u32 %r542, 31;
mov.u32 %r543, 16;
mov.u32 %r544, -1;
shfl.sync.bfly.b32 %r545|%p170, %r541, %r543, %r542, %r544;
mov.b32 %f418, %r545;
add.f32 %f419, %f478, %f418;
mov.b32 %r546, %f419;
mov.u32 %r547, 8;
shfl.sync.bfly.b32 %r548|%p171, %r546, %r547, %r542, %r544;
mov.b32 %f420, %r548;
add.f32 %f421, %f419, %f420;
mov.b32 %r549, %f421;
mov.u32 %r550, 4;
shfl.sync.bfly.b32 %r551|%p172, %r549, %r550, %r542, %r544;
mov.b32 %f422, %r551;
add.f32 %f423, %f421, %f422;
mov.b32 %r552, %f423;
mov.u32 %r553, 2;
shfl.sync.bfly.b32 %r554|%p173, %r552, %r553, %r542, %r544;
mov.b32 %f424, %r554;
add.f32 %f425, %f423, %f424;
mov.b32 %r555, %f425;
mov.u32 %r556, 1;
shfl.sync.bfly.b32 %r557|%p174, %r555, %r556, %r542, %r544;
mov.b32 %f426, %r557;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
setp.ne.s32 %p175, %r6, 0;
@%p175 bra $L__BB0_183;
mul.lo.s32 %r116, %r78, %r585;
add.s32 %r558, %r77, %r116;
setp.ge.s32 %p176, %r558, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r561, %r79, %r116;
mul.wide.s32 %rd133, %r561, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
mov.b32 %r562, %f427;
selp.b32 %r560, %r562, 0, %p162;
// begin inline asm
st.global.cs.v2.s32 [%rd132], {%r115,%r560};
// end inline asm
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
div.s32 %r84, %r75, %r3;
mad.lo.s32 %r85, %r155, %r6, %r76;
shl.b32 %r86, %r71, 1;
shl.b32 %r87, %r9, 1;
mul.lo.s32 %r88, %r155, %r3;
mov.u32 %r586, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
mad.lo.s32 %r90, %r78, %r586, %r77;
mad.lo.s32 %r399, %r87, %r586, %r86;
mad.lo.s32 %r588, %r4, %r399, %r85;
mov.u32 %r398, 0;
mov.f32 %f466, 0f00000000;
mov.u32 %r587, %r6;
mov.u32 %r589, %r398;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
mov.u32 %r590, %r398;
mov.u32 %r591, %r398;
@%p110 bra $L__BB0_144;
setp.ge.s32 %p111, %r587, %r9;
mov.u32 %r590, %r398;
mov.u32 %r591, %r398;
@%p111 bra $L__BB0_144;
mul.wide.s32 %rd127, %r588, 4;
add.s64 %rd126, %rd41, %rd127;
// begin inline asm
ld.volatile.global.v2.s32 {%r591,%r590}, [%rd126];
// end inline asm
$L__BB0_144:
mov.b32 %f340, %r591;
add.f32 %f467, %f467, %f340;
mov.b32 %f341, %r590;
add.f32 %f466, %f466, %f341;
add.s32 %r588, %r588, %r88;
add.s32 %r587, %r587, %r3;
add.s32 %r589, %r589, 1;
setp.lt.s32 %p112, %r589, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
mov.b32 %r406, %f467;
mov.u32 %r407, 31;
mov.u32 %r408, 16;
mov.u32 %r409, -1;
shfl.sync.bfly.b32 %r410|%p113, %r406, %r408, %r407, %r409;
mov.b32 %f342, %r410;
add.f32 %f343, %f467, %f342;
mov.b32 %r411, %f343;
mov.u32 %r412, 8;
shfl.sync.bfly.b32 %r413|%p114, %r411, %r412, %r407, %r409;
mov.b32 %f344, %r413;
add.f32 %f345, %f343, %f344;
mov.b32 %r414, %f345;
mov.u32 %r415, 4;
shfl.sync.bfly.b32 %r416|%p115, %r414, %r415, %r407, %r409;
mov.b32 %f346, %r416;
add.f32 %f347, %f345, %f346;
mov.b32 %r417, %f347;
mov.u32 %r418, 2;
shfl.sync.bfly.b32 %r419|%p116, %r417, %r418, %r407, %r409;
mov.b32 %f348, %r419;
add.f32 %f349, %f347, %f348;
mov.b32 %r420, %f349;
mov.u32 %r421, 1;
shfl.sync.bfly.b32 %r422|%p117, %r420, %r421, %r407, %r409;
mov.b32 %f350, %r422;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
st.shared.f32 [%rd29], %f469;
$L__BB0_147:
setp.ne.s32 %p119, %r81, 0;
bar.sync 0;
@%p119 bra $L__BB0_151;
setp.ge.u32 %p120, %r82, %r80;
mov.f32 %f468, 0f00000000;
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
mov.b32 %r423, %f468;
mov.u32 %r424, 31;
mov.u32 %r425, 16;
mov.u32 %r426, -1;
shfl.sync.bfly.b32 %r427|%p121, %r423, %r425, %r424, %r426;
mov.b32 %f352, %r427;
add.f32 %f353, %f468, %f352;
mov.b32 %r428, %f353;
mov.u32 %r429, 8;
shfl.sync.bfly.b32 %r430|%p122, %r428, %r429, %r424, %r426;
mov.b32 %f354, %r430;
add.f32 %f355, %f353, %f354;
mov.b32 %r431, %f355;
mov.u32 %r432, 4;
shfl.sync.bfly.b32 %r433|%p123, %r431, %r432, %r424, %r426;
mov.b32 %f356, %r433;
add.f32 %f357, %f355, %f356;
mov.b32 %r434, %f357;
mov.u32 %r435, 2;
shfl.sync.bfly.b32 %r436|%p124, %r434, %r435, %r424, %r426;
mov.b32 %f358, %r436;
add.f32 %f359, %f357, %f358;
mov.b32 %r437, %f359;
mov.u32 %r438, 1;
shfl.sync.bfly.b32 %r439|%p125, %r437, %r438, %r424, %r426;
mov.b32 %f360, %r439;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
mov.b32 %r440, %f361;
setp.eq.s32 %p127, %r82, 0;
selp.b32 %r102, %r440, 0, %p127;
bar.sync 0;
mov.b32 %r441, %f466;
mov.u32 %r442, 31;
mov.u32 %r443, 16;
mov.u32 %r444, -1;
shfl.sync.bfly.b32 %r445|%p128, %r441, %r443, %r442, %r444;
mov.b32 %f362, %r445;
add.f32 %f363, %f466, %f362;
mov.b32 %r446, %f363;
mov.u32 %r447, 8;
shfl.sync.bfly.b32 %r448|%p129, %r446, %r447, %r442, %r444;
mov.b32 %f364, %r448;
add.f32 %f365, %f363, %f364;
mov.b32 %r449, %f365;
mov.u32 %r450, 4;
shfl.sync.bfly.b32 %r451|%p130, %r449, %r450, %r442, %r444;
mov.b32 %f366, %r451;
add.f32 %f367, %f365, %f366;
mov.b32 %r452, %f367;
mov.u32 %r453, 2;
shfl.sync.bfly.b32 %r454|%p131, %r452, %r453, %r442, %r444;
mov.b32 %f368, %r454;
add.f32 %f369, %f367, %f368;
mov.b32 %r455, %f369;
mov.u32 %r456, 1;
shfl.sync.bfly.b32 %r457|%p132, %r455, %r456, %r442, %r444;
mov.b32 %f370, %r457;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
$L__BB0_153:
bar.sync 0;
@%p119 bra $L__BB0_157;
setp.ge.u32 %p134, %r82, %r80;
mov.f32 %f470, 0f00000000;
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
mov.b32 %r458, %f470;
mov.u32 %r459, 31;
mov.u32 %r460, 16;
mov.u32 %r461, -1;
shfl.sync.bfly.b32 %r462|%p135, %r458, %r460, %r459, %r461;
mov.b32 %f372, %r462;
add.f32 %f373, %f470, %f372;
mov.b32 %r463, %f373;
mov.u32 %r464, 8;
shfl.sync.bfly.b32 %r465|%p136, %r463, %r464, %r459, %r461;
mov.b32 %f374, %r465;
add.f32 %f375, %f373, %f374;
mov.b32 %r466, %f375;
mov.u32 %r467, 4;
shfl.sync.bfly.b32 %r468|%p137, %r466, %r467, %r459, %r461;
mov.b32 %f376, %r468;
add.f32 %f377, %f375, %f376;
mov.b32 %r469, %f377;
mov.u32 %r470, 2;
shfl.sync.bfly.b32 %r471|%p138, %r469, %r470, %r459, %r461;
mov.b32 %f378, %r471;
add.f32 %f379, %f377, %f378;
mov.b32 %r472, %f379;
mov.u32 %r473, 1;
shfl.sync.bfly.b32 %r474|%p139, %r472, %r473, %r459, %r461;
mov.b32 %f380, %r474;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
setp.ne.s32 %p140, %r6, 0;
@%p140 bra $L__BB0_160;
mul.lo.s32 %r103, %r78, %r586;
add.s32 %r475, %r77, %r103;
setp.ge.s32 %p141, %r475, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_19_cu_5c77fe0a_723310nvfuser_19ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r478, %r79, %r103;
mul.wide.s32 %rd129, %r478, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
mov.b32 %r479, %f381;
selp.b32 %r477, %r479, 0, %p127;
// begin inline asm
st.global.cs.v2.s32 [%rd128], {%r102,%r477};
// end inline asm
$L__BB0_160:
add.s32 %r586, %r586, 1;
setp.lt.s32 %p143, %r586, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<184>;
.reg .f32 %f<480>;
- .reg .b32 %r<597>;
+ .reg .b32 %r<595>;
.reg .f64 %fd<3>;
.reg .b64 %rd<137>;
ld.param.v2.u32 {%r154, %r155}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
@@ -50,136 +50,136 @@
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r190, %r155, 3;
shr.s32 %r191, %r190, 31;
shr.u32 %r192, %r191, 30;
add.s32 %r193, %r190, %r192;
- shr.s32 %r194, %r193, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r195, %r194, %r2;
- add.s32 %r196, %r195, 31;
- shr.s32 %r197, %r196, 31;
- shr.u32 %r198, %r197, 27;
- add.s32 %r199, %r196, %r198;
- shr.u32 %r200, %r199, 5;
- mov.u32 %r3, %ntid.y;
- mul.lo.s32 %r201, %r3, %r200;
- shl.b32 %r202, %r201, 7;
- cvt.u64.u32 %rd1, %r202;
- mul.lo.s32 %r203, %r3, %r194;
- shl.b32 %r204, %r203, 4;
- or.b32 %r205, %r204, 15;
- and.b32 %r4, %r205, -16;
- add.s32 %r206, %r205, %r4;
- and.b32 %r207, %r206, -16;
- cvt.s64.s32 %rd2, %r207;
+ shr.s32 %r2, %r193, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r194, %r2, %r3;
+ add.s32 %r195, %r194, 31;
+ shr.s32 %r196, %r195, 31;
+ shr.u32 %r197, %r196, 27;
+ add.s32 %r198, %r195, %r197;
+ shr.u32 %r199, %r198, 5;
+ mov.u32 %r4, %ntid.y;
+ mul.lo.s32 %r200, %r4, %r199;
+ shl.b32 %r201, %r200, 7;
+ cvt.u64.u32 %rd1, %r201;
+ mul.lo.s32 %r202, %r4, %r2;
+ shl.b32 %r203, %r202, 4;
+ or.b32 %r204, %r203, 15;
+ and.b32 %r5, %r204, -16;
+ add.s32 %r205, %r204, %r5;
+ and.b32 %r206, %r205, -16;
+ cvt.s64.s32 %rd2, %r206;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p6, %r5, %r194;
- shl.b32 %r6, %r5, 2;
- or.b32 %r208, %r6, 3;
- setp.lt.s32 %p7, %r208, %r155;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p6, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r207, %r7, 3;
+ setp.lt.s32 %p7, %r207, %r155;
and.pred %p1, %p7, %p6;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p8, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p8, %r8, 0;
and.pred %p2, %p8, %p1;
not.pred %p9, %p2;
@%p9 bra $L__BB0_2;
add.s64 %rd46, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r209, smem_ptr; }
-
-
- shl.b32 %r212, %r5, 4;
- add.s32 %r210, %r209, %r212;
- mul.wide.s32 %rd48, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r208, smem_ptr; }
+
+
+ shl.b32 %r211, %r6, 4;
+ add.s32 %r209, %r208, %r211;
+ mul.wide.s32 %rd48, %r7, 4;
add.s64 %rd47, %rd37, %rd48;
- mov.u32 %r211, 0;
+ mov.u32 %r210, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r211, 0;
- cp.async.ca.shared.global [%r210], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r210, 0;
+ cp.async.ca.shared.global [%r209], [%rd47], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r213, %r3, 215;
- div.s32 %r214, %r213, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r215, %r8, %r214;
- add.s32 %r216, %r215, -1;
- div.s32 %r9, %r216, %r8;
- setp.gt.s32 %p10, %r9, 0;
+ add.s32 %r212, %r4, 215;
+ div.s32 %r213, %r212, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r214, %r9, %r213;
+ add.s32 %r215, %r214, -1;
+ div.s32 %r10, %r215, %r9;
+ setp.gt.s32 %p10, %r10, 0;
add.s64 %rd4, %rd1, %rd2;
@%p10 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r155;
- cvt.s64.s32 %rd49, %r4;
+ cvt.s64.s32 %rd49, %r5;
add.s64 %rd50, %rd1, %rd49;
add.s64 %rd52, %rd44, %rd1;
- mov.u32 %r218, %ctaid.y;
- mul.lo.s32 %r219, %r9, %r3;
- mul.lo.s32 %r10, %r219, %r218;
- shl.b32 %r220, %r7, 2;
- shl.b32 %r221, %r5, 4;
- mad.lo.s32 %r11, %r220, %r155, %r221;
- mul.lo.s32 %r222, %r155, %r7;
- cvt.s64.s32 %rd53, %r222;
- cvt.s64.s32 %rd54, %r6;
+ mov.u32 %r217, %ctaid.y;
+ mul.lo.s32 %r218, %r10, %r4;
+ mul.lo.s32 %r11, %r218, %r217;
+ mad.lo.s32 %r219, %r2, %r8, %r6;
+ shl.b32 %r12, %r219, 4;
+ mul.lo.s32 %r220, %r155, %r8;
+ cvt.s64.s32 %rd53, %r220;
+ cvt.s64.s32 %rd54, %r7;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r223, %r10, %r155;
- cvt.s64.s32 %rd6, %r223;
- mul.lo.s32 %r12, %r155, %r3;
- mul.lo.s32 %r13, %r9, %r218;
- add.s32 %r14, %r222, %r6;
+ mul.lo.s32 %r221, %r11, %r155;
+ cvt.s64.s32 %rd6, %r221;
+ mul.lo.s32 %r13, %r155, %r4;
+ mul.lo.s32 %r14, %r10, %r217;
+ shl.b32 %r222, %r8, 2;
+ mad.lo.s32 %r223, %r222, %r2, %r7;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r14, 4;
+ mul.wide.s32 %rd56, %r223, 4;
add.s64 %rd7, %rd55, %rd56;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r224, %tid.z;
- mad.lo.s32 %r225, %r224, %r3, %r7;
- shr.u32 %r15, %r2, 5;
+ mad.lo.s32 %r225, %r224, %r4, %r8;
+ shr.u32 %r15, %r3, 5;
mul.lo.s32 %r226, %r225, %r15;
- shr.u32 %r16, %r5, 5;
+ shr.u32 %r16, %r6, 5;
add.s32 %r227, %r226, %r16;
mul.wide.u32 %rd57, %r227, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd9, %rd52, %rd56;
- and.b32 %r17, %r5, 31;
+ and.b32 %r17, %r6, 31;
add.s32 %r228, %r226, %r17;
mul.wide.u32 %rd58, %r228, 4;
add.s64 %rd10, %rd44, %rd58;
add.s64 %rd59, %rd44, %rd4;
- mul.wide.s32 %rd60, %r6, 4;
+ mul.wide.s32 %rd60, %r7, 4;
add.s64 %rd11, %rd59, %rd60;
mul.wide.s32 %rd61, %r225, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd36;
add.s64 %rd18, %rd45, %rd50;
- mov.u32 %r565, 0;
+ mov.u32 %r563, 0;
mov.f32 %f438, 0f00000000;
not.pred %p11, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd18; cvt.u32.u64 %r231, smem_ptr; }
- add.s32 %r232, %r11, %r231;
+ add.s32 %r232, %r231, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r257, smem_ptr; }
- add.s32 %r258, %r11, %r257;
+ add.s32 %r258, %r257, %r12;
mov.f32 %f439, %f438;
mov.f32 %f440, %f438;
mov.f32 %f441, %f438;
mov.f32 %f452, %f438;
mov.f32 %f453, %f438;
@@ -188,16 +188,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p11 bra $L__BB0_8;
- mad.lo.s32 %r229, %r565, %r3, %r7;
- add.s32 %r230, %r229, %r10;
+ mad.lo.s32 %r229, %r563, %r4, %r8;
+ add.s32 %r230, %r229, %r11;
setp.gt.s32 %p12, %r230, 215;
@%p12 bra $L__BB0_8;
- mul.lo.s32 %r234, %r12, %r565;
+ mul.lo.s32 %r234, %r13, %r563;
cvt.s64.s32 %rd65, %r234;
add.s64 %rd66, %rd5, %rd65;
add.s64 %rd67, %rd66, %rd6;
shl.b64 %rd68, %rd67, 2;
add.s64 %rd64, %rd33, %rd68;
@@ -216,53 +216,53 @@
cp.async.wait_all;
@%p11 bra $L__BB0_10;
- add.s32 %r235, %r13, %r565;
- mad.lo.s32 %r236, %r235, %r3, %r7;
+ add.s32 %r235, %r14, %r563;
+ mad.lo.s32 %r236, %r235, %r4, %r8;
setp.lt.s32 %p14, %r236, 216;
@%p14 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
+ ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r245, %r13, %r565;
- mad.lo.s32 %r246, %r245, %r3, %r7;
+ add.s32 %r245, %r14, %r563;
+ mad.lo.s32 %r246, %r245, %r4, %r8;
setp.gt.s32 %p15, %r246, 215;
- mov.u32 %r566, 0;
- mov.u32 %r567, %r566;
- mov.u32 %r568, %r566;
- mov.u32 %r569, %r566;
+ mov.u32 %r564, 0;
+ mov.u32 %r565, %r564;
+ mov.u32 %r566, %r564;
+ mov.u32 %r567, %r564;
@%p15 bra $L__BB0_15;
- ld.shared.v4.u32 {%r566, %r567, %r568, %r569}, [%rd7];
+ ld.shared.v4.u32 {%r564, %r565, %r566, %r567}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r566, 0;
- mov.u32 %r567, %r566;
- mov.u32 %r568, %r566;
- mov.u32 %r569, %r566;
+ mov.u32 %r564, 0;
+ mov.u32 %r565, %r564;
+ mov.u32 %r566, %r564;
+ mov.u32 %r567, %r564;
$L__BB0_15:
- add.s32 %r255, %r13, %r565;
- mad.lo.s32 %r33, %r255, %r3, %r7;
- mov.b32 %f125, %r569;
+ add.s32 %r255, %r14, %r563;
+ mad.lo.s32 %r33, %r255, %r4, %r8;
+ mov.b32 %f125, %r567;
add.f32 %f455, %f455, %f125;
- mov.b32 %f126, %r568;
+ mov.b32 %f126, %r566;
add.f32 %f454, %f454, %f126;
- mov.b32 %f127, %r567;
+ mov.b32 %f127, %r565;
add.f32 %f453, %f453, %f127;
- mov.b32 %f128, %r566;
+ mov.b32 %f128, %r564;
add.f32 %f452, %f452, %f128;
setp.gt.s32 %p16, %r33, 215;
mov.f32 %f436, 0f00000000;
@%p16 bra $L__BB0_17;
@@ -275,11 +275,11 @@
setp.lt.s32 %p17, %r33, 216;
and.pred %p3, %p1, %p17;
not.pred %p18, %p3;
@%p18 bra $L__BB0_19;
- mul.lo.s32 %r260, %r12, %r565;
+ mul.lo.s32 %r260, %r13, %r563;
cvt.s64.s32 %rd73, %r260;
add.s64 %rd74, %rd5, %rd73;
add.s64 %rd75, %rd74, %rd6;
shl.b64 %rd76, %rd75, 2;
add.s64 %rd72, %rd34, %rd76;
@@ -412,11 +412,11 @@
shfl.sync.bfly.b32 %r295|%p33, %r293, %r294, %r280, %r282;
mov.b32 %f192, %r295;
add.f32 %f445, %f191, %f192;
$L__BB0_29:
- setp.ne.s32 %p180, %r17, 0;
+ setp.ne.s32 %p181, %r17, 0;
bar.sync 0;
mov.b32 %r296, %f442;
mov.u32 %r297, 31;
mov.u32 %r298, 16;
mov.u32 %r299, -1;
@@ -442,21 +442,21 @@
mov.u32 %r311, 1;
shfl.sync.bfly.b32 %r312|%p38, %r310, %r311, %r297, %r299;
mov.b32 %f201, %r312;
add.f32 %f447, %f200, %f201;
bar.sync 0;
- @%p180 bra $L__BB0_31;
+ @%p181 bra $L__BB0_31;
st.shared.f32 [%rd8], %f447;
$L__BB0_31:
setp.eq.s32 %p183, %r17, 0;
- setp.ne.s32 %p181, %r16, 0;
+ setp.ne.s32 %p178, %r16, 0;
bar.sync 0;
add.f32 %f202, %f445, 0f00000000;
selp.f32 %f37, %f202, 0f00000000, %p183;
- @%p181 bra $L__BB0_35;
+ @%p178 bra $L__BB0_35;
setp.ge.u32 %p41, %r17, %r15;
mov.f32 %f446, 0f00000000;
@%p41 bra $L__BB0_34;
@@ -491,11 +491,11 @@
mov.b32 %f212, %r329;
add.f32 %f447, %f211, %f212;
$L__BB0_35:
bar.sync 0;
- setp.ne.s32 %p47, %r5, 0;
+ setp.ne.s32 %p47, %r6, 0;
@%p47 bra $L__BB0_37;
st.shared.f32 [%rd12], %f37;
$L__BB0_37:
@@ -553,21 +553,20 @@
sub.f32 %f268, %f264, %f42;
mul.f32 %f269, %f43, %f267;
sub.f32 %f270, %f268, %f269;
mul.f32 %f271, %f215, %f270;
mov.b32 %r333, %f271;
- mad.lo.s32 %r334, %r565, %r3, %r10;
- mad.lo.s32 %r335, %r334, %r155, %r14;
- mul.wide.s32 %rd80, %r335, 4;
+ mad.lo.s32 %r334, %r33, %r155, %r7;
+ mul.wide.s32 %rd80, %r334, 4;
add.s64 %rd79, %rd38, %rd80;
st.global.cs.v4.s32 [%rd79], {%r330,%r331,%r332,%r333};
$L__BB0_41:
- add.s32 %r565, %r565, 1;
- setp.lt.s32 %p51, %r565, %r9;
+ add.s32 %r563, %r563, 1;
+ setp.lt.s32 %p51, %r563, %r10;
@%p51 bra $L__BB0_5;
bra.uni $L__BB0_42;
$L__BB0_3:
mov.f32 %f438, 0f00000000;
@@ -578,31 +577,31 @@
mov.f32 %f453, %f438;
mov.f32 %f454, %f438;
mov.f32 %f455, %f438;
$L__BB0_42:
- mov.u32 %r336, %tid.z;
- mad.lo.s32 %r35, %r336, %r3, %r7;
- mad.lo.s32 %r36, %r35, %r2, %r5;
+ mov.u32 %r335, %tid.z;
+ mad.lo.s32 %r35, %r335, %r4, %r8;
+ mad.lo.s32 %r36, %r35, %r3, %r6;
mul.wide.u32 %rd81, %r36, 4;
add.s64 %rd22, %rd44, %rd81;
- clz.b32 %r337, %r3;
- mov.u32 %r338, 31;
- sub.s32 %r339, %r338, %r337;
- mov.u32 %r340, 1;
- shl.b32 %r37, %r340, %r339;
- setp.lt.u32 %p52, %r7, %r37;
- add.s32 %r341, %r37, %r7;
- setp.lt.u32 %p53, %r341, %r3;
+ clz.b32 %r336, %r4;
+ mov.u32 %r337, 31;
+ sub.s32 %r338, %r337, %r336;
+ mov.u32 %r339, 1;
+ shl.b32 %r37, %r339, %r338;
+ setp.lt.u32 %p52, %r8, %r37;
+ add.s32 %r340, %r37, %r8;
+ setp.lt.u32 %p53, %r340, %r4;
and.pred %p5, %p52, %p53;
- shl.b32 %r342, %r2, %r339;
- add.s32 %r343, %r36, %r342;
- mul.wide.s32 %rd83, %r343, 4;
+ shl.b32 %r341, %r3, %r338;
+ add.s32 %r342, %r36, %r341;
+ mul.wide.s32 %rd83, %r342, 4;
add.s64 %rd23, %rd44, %rd83;
- shr.u32 %r344, %r37, 31;
- add.s32 %r345, %r37, %r344;
- shr.s32 %r584, %r345, 1;
+ shr.u32 %r343, %r37, 31;
+ add.s32 %r344, %r37, %r343;
+ shr.s32 %r582, %r344, 1;
st.shared.f32 [%rd22], %f438;
bar.sync 0;
not.pred %p54, %p5;
@%p54 bra $L__BB0_44;
@@ -614,49 +613,49 @@
$L__BB0_44:
setp.lt.s32 %p55, %r37, 4;
bar.sync 0;
@%p55 bra $L__BB0_49;
- mov.u32 %r570, %r584;
+ mov.u32 %r568, %r582;
$L__BB0_46:
- setp.ge.u32 %p56, %r7, %r570;
+ setp.ge.u32 %p56, %r8, %r568;
@%p56 bra $L__BB0_48;
- mad.lo.s32 %r346, %r570, %r2, %r36;
- mul.wide.s32 %rd84, %r346, 4;
+ mad.lo.s32 %r345, %r568, %r3, %r36;
+ mul.wide.s32 %rd84, %r345, 4;
add.s64 %rd86, %rd44, %rd84;
ld.shared.f32 %f275, [%rd22];
ld.shared.f32 %f276, [%rd86];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd22], %f277;
$L__BB0_48:
bar.sync 0;
- shr.u32 %r40, %r570, 1;
- setp.gt.u32 %p57, %r570, 3;
- mov.u32 %r570, %r40;
+ shr.u32 %r40, %r568, 1;
+ setp.gt.u32 %p57, %r568, 3;
+ mov.u32 %r568, %r40;
@%p57 bra $L__BB0_46;
$L__BB0_49:
- add.s32 %r348, %r36, %r2;
- mul.wide.u32 %rd87, %r348, 4;
+ add.s32 %r347, %r36, %r3;
+ mul.wide.u32 %rd87, %r347, 4;
add.s64 %rd24, %rd44, %rd87;
- setp.ne.s32 %p58, %r7, 0;
- mov.u32 %r571, 0;
+ setp.ne.s32 %p58, %r8, 0;
+ mov.u32 %r569, 0;
@%p58 bra $L__BB0_53;
- setp.lt.u32 %p59, %r3, 2;
+ setp.lt.u32 %p59, %r4, 2;
ld.shared.f32 %f278, [%rd22];
add.f32 %f456, %f278, 0f00000000;
@%p59 bra $L__BB0_52;
ld.shared.f32 %f279, [%rd24];
add.f32 %f456, %f456, %f279;
$L__BB0_52:
- mov.b32 %r571, %f456;
+ mov.b32 %r569, %f456;
$L__BB0_53:
bar.sync 0;
st.shared.f32 [%rd22], %f439;
bar.sync 0;
@@ -669,45 +668,45 @@
$L__BB0_55:
bar.sync 0;
@%p55 bra $L__BB0_60;
- mov.u32 %r572, %r584;
+ mov.u32 %r570, %r582;
$L__BB0_57:
- setp.ge.u32 %p62, %r7, %r572;
+ setp.ge.u32 %p62, %r8, %r570;
@%p62 bra $L__BB0_59;
- mad.lo.s32 %r349, %r572, %r2, %r36;
- mul.wide.s32 %rd89, %r349, 4;
+ mad.lo.s32 %r348, %r570, %r3, %r36;
+ mul.wide.s32 %rd89, %r348, 4;
add.s64 %rd91, %rd44, %rd89;
ld.shared.f32 %f283, [%rd22];
ld.shared.f32 %f284, [%rd91];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd22], %f285;
$L__BB0_59:
bar.sync 0;
- shr.u32 %r44, %r572, 1;
- setp.gt.u32 %p63, %r572, 3;
- mov.u32 %r572, %r44;
+ shr.u32 %r44, %r570, 1;
+ setp.gt.u32 %p63, %r570, 3;
+ mov.u32 %r570, %r44;
@%p63 bra $L__BB0_57;
$L__BB0_60:
- mov.u32 %r573, 0;
+ mov.u32 %r571, 0;
@%p58 bra $L__BB0_64;
- setp.lt.u32 %p65, %r3, 2;
+ setp.lt.u32 %p65, %r4, 2;
ld.shared.f32 %f286, [%rd22];
add.f32 %f457, %f286, 0f00000000;
@%p65 bra $L__BB0_63;
ld.shared.f32 %f287, [%rd24];
add.f32 %f457, %f457, %f287;
$L__BB0_63:
- mov.b32 %r573, %f457;
+ mov.b32 %r571, %f457;
$L__BB0_64:
bar.sync 0;
st.shared.f32 [%rd22], %f440;
bar.sync 0;
@@ -720,45 +719,45 @@
$L__BB0_66:
bar.sync 0;
@%p55 bra $L__BB0_71;
- mov.u32 %r574, %r584;
+ mov.u32 %r572, %r582;
$L__BB0_68:
- setp.ge.u32 %p68, %r7, %r574;
+ setp.ge.u32 %p68, %r8, %r572;
@%p68 bra $L__BB0_70;
- mad.lo.s32 %r351, %r574, %r2, %r36;
- mul.wide.s32 %rd92, %r351, 4;
+ mad.lo.s32 %r350, %r572, %r3, %r36;
+ mul.wide.s32 %rd92, %r350, 4;
add.s64 %rd94, %rd44, %rd92;
ld.shared.f32 %f291, [%rd22];
ld.shared.f32 %f292, [%rd94];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd22], %f293;
$L__BB0_70:
bar.sync 0;
- shr.u32 %r48, %r574, 1;
- setp.gt.u32 %p69, %r574, 3;
- mov.u32 %r574, %r48;
+ shr.u32 %r48, %r572, 1;
+ setp.gt.u32 %p69, %r572, 3;
+ mov.u32 %r572, %r48;
@%p69 bra $L__BB0_68;
$L__BB0_71:
- mov.u32 %r575, 0;
+ mov.u32 %r573, 0;
@%p58 bra $L__BB0_75;
- setp.lt.u32 %p71, %r3, 2;
+ setp.lt.u32 %p71, %r4, 2;
ld.shared.f32 %f294, [%rd22];
add.f32 %f458, %f294, 0f00000000;
@%p71 bra $L__BB0_74;
ld.shared.f32 %f295, [%rd24];
add.f32 %f458, %f458, %f295;
$L__BB0_74:
- mov.b32 %r575, %f458;
+ mov.b32 %r573, %f458;
$L__BB0_75:
bar.sync 0;
st.shared.f32 [%rd22], %f441;
bar.sync 0;
@@ -771,45 +770,45 @@
$L__BB0_77:
bar.sync 0;
@%p55 bra $L__BB0_82;
- mov.u32 %r576, %r584;
+ mov.u32 %r574, %r582;
$L__BB0_79:
- setp.ge.u32 %p74, %r7, %r576;
+ setp.ge.u32 %p74, %r8, %r574;
@%p74 bra $L__BB0_81;
- mad.lo.s32 %r353, %r576, %r2, %r36;
- mul.wide.s32 %rd95, %r353, 4;
+ mad.lo.s32 %r352, %r574, %r3, %r36;
+ mul.wide.s32 %rd95, %r352, 4;
add.s64 %rd97, %rd44, %rd95;
ld.shared.f32 %f299, [%rd22];
ld.shared.f32 %f300, [%rd97];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd22], %f301;
$L__BB0_81:
bar.sync 0;
- shr.u32 %r52, %r576, 1;
- setp.gt.u32 %p75, %r576, 3;
- mov.u32 %r576, %r52;
+ shr.u32 %r52, %r574, 1;
+ setp.gt.u32 %p75, %r574, 3;
+ mov.u32 %r574, %r52;
@%p75 bra $L__BB0_79;
$L__BB0_82:
- mov.u32 %r577, 0;
+ mov.u32 %r575, 0;
@%p58 bra $L__BB0_86;
- setp.lt.u32 %p77, %r3, 2;
+ setp.lt.u32 %p77, %r4, 2;
ld.shared.f32 %f302, [%rd22];
add.f32 %f459, %f302, 0f00000000;
@%p77 bra $L__BB0_85;
ld.shared.f32 %f303, [%rd24];
add.f32 %f459, %f459, %f303;
$L__BB0_85:
- mov.b32 %r577, %f459;
+ mov.b32 %r575, %f459;
$L__BB0_86:
bar.sync 0;
st.shared.f32 [%rd22], %f452;
bar.sync 0;
@@ -822,45 +821,45 @@
$L__BB0_88:
bar.sync 0;
@%p55 bra $L__BB0_93;
- mov.u32 %r578, %r584;
+ mov.u32 %r576, %r582;
$L__BB0_90:
- setp.ge.u32 %p80, %r7, %r578;
+ setp.ge.u32 %p80, %r8, %r576;
@%p80 bra $L__BB0_92;
- mad.lo.s32 %r355, %r578, %r2, %r36;
- mul.wide.s32 %rd98, %r355, 4;
+ mad.lo.s32 %r354, %r576, %r3, %r36;
+ mul.wide.s32 %rd98, %r354, 4;
add.s64 %rd100, %rd44, %rd98;
ld.shared.f32 %f307, [%rd22];
ld.shared.f32 %f308, [%rd100];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd22], %f309;
$L__BB0_92:
bar.sync 0;
- shr.u32 %r56, %r578, 1;
- setp.gt.u32 %p81, %r578, 3;
- mov.u32 %r578, %r56;
+ shr.u32 %r56, %r576, 1;
+ setp.gt.u32 %p81, %r576, 3;
+ mov.u32 %r576, %r56;
@%p81 bra $L__BB0_90;
$L__BB0_93:
- mov.u32 %r579, 0;
+ mov.u32 %r577, 0;
@%p58 bra $L__BB0_97;
- setp.lt.u32 %p83, %r3, 2;
+ setp.lt.u32 %p83, %r4, 2;
ld.shared.f32 %f310, [%rd22];
add.f32 %f460, %f310, 0f00000000;
@%p83 bra $L__BB0_96;
ld.shared.f32 %f311, [%rd24];
add.f32 %f460, %f460, %f311;
$L__BB0_96:
- mov.b32 %r579, %f460;
+ mov.b32 %r577, %f460;
$L__BB0_97:
bar.sync 0;
st.shared.f32 [%rd22], %f453;
bar.sync 0;
@@ -873,45 +872,45 @@
$L__BB0_99:
bar.sync 0;
@%p55 bra $L__BB0_104;
- mov.u32 %r580, %r584;
+ mov.u32 %r578, %r582;
$L__BB0_101:
- setp.ge.u32 %p86, %r7, %r580;
+ setp.ge.u32 %p86, %r8, %r578;
@%p86 bra $L__BB0_103;
- mad.lo.s32 %r357, %r580, %r2, %r36;
- mul.wide.s32 %rd101, %r357, 4;
+ mad.lo.s32 %r356, %r578, %r3, %r36;
+ mul.wide.s32 %rd101, %r356, 4;
add.s64 %rd103, %rd44, %rd101;
ld.shared.f32 %f315, [%rd22];
ld.shared.f32 %f316, [%rd103];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd22], %f317;
$L__BB0_103:
bar.sync 0;
- shr.u32 %r60, %r580, 1;
- setp.gt.u32 %p87, %r580, 3;
- mov.u32 %r580, %r60;
+ shr.u32 %r60, %r578, 1;
+ setp.gt.u32 %p87, %r578, 3;
+ mov.u32 %r578, %r60;
@%p87 bra $L__BB0_101;
$L__BB0_104:
- mov.u32 %r581, 0;
+ mov.u32 %r579, 0;
@%p58 bra $L__BB0_108;
- setp.lt.u32 %p89, %r3, 2;
+ setp.lt.u32 %p89, %r4, 2;
ld.shared.f32 %f318, [%rd22];
add.f32 %f461, %f318, 0f00000000;
@%p89 bra $L__BB0_107;
ld.shared.f32 %f319, [%rd24];
add.f32 %f461, %f461, %f319;
$L__BB0_107:
- mov.b32 %r581, %f461;
+ mov.b32 %r579, %f461;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd22], %f454;
bar.sync 0;
@@ -924,45 +923,45 @@
$L__BB0_110:
bar.sync 0;
@%p55 bra $L__BB0_115;
- mov.u32 %r582, %r584;
+ mov.u32 %r580, %r582;
$L__BB0_112:
- setp.ge.u32 %p92, %r7, %r582;
+ setp.ge.u32 %p92, %r8, %r580;
@%p92 bra $L__BB0_114;
- mad.lo.s32 %r359, %r582, %r2, %r36;
- mul.wide.s32 %rd104, %r359, 4;
+ mad.lo.s32 %r358, %r580, %r3, %r36;
+ mul.wide.s32 %rd104, %r358, 4;
add.s64 %rd106, %rd44, %rd104;
ld.shared.f32 %f323, [%rd22];
ld.shared.f32 %f324, [%rd106];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd22], %f325;
$L__BB0_114:
bar.sync 0;
- shr.u32 %r64, %r582, 1;
- setp.gt.u32 %p93, %r582, 3;
- mov.u32 %r582, %r64;
+ shr.u32 %r64, %r580, 1;
+ setp.gt.u32 %p93, %r580, 3;
+ mov.u32 %r580, %r64;
@%p93 bra $L__BB0_112;
$L__BB0_115:
- mov.u32 %r583, 0;
+ mov.u32 %r581, 0;
@%p58 bra $L__BB0_119;
- setp.lt.u32 %p95, %r3, 2;
+ setp.lt.u32 %p95, %r4, 2;
ld.shared.f32 %f326, [%rd22];
add.f32 %f462, %f326, 0f00000000;
@%p95 bra $L__BB0_118;
ld.shared.f32 %f327, [%rd24];
add.f32 %f462, %f462, %f327;
$L__BB0_118:
- mov.b32 %r583, %f462;
+ mov.b32 %r581, %f462;
$L__BB0_119:
bar.sync 0;
st.shared.f32 [%rd22], %f455;
bar.sync 0;
@@ -976,217 +975,216 @@
$L__BB0_121:
bar.sync 0;
@%p55 bra $L__BB0_125;
$L__BB0_122:
- setp.ge.u32 %p98, %r7, %r584;
+ setp.ge.u32 %p98, %r8, %r582;
@%p98 bra $L__BB0_124;
- mad.lo.s32 %r361, %r584, %r2, %r36;
- mul.wide.s32 %rd107, %r361, 4;
+ mad.lo.s32 %r360, %r582, %r3, %r36;
+ mul.wide.s32 %rd107, %r360, 4;
add.s64 %rd109, %rd44, %rd107;
ld.shared.f32 %f331, [%rd22];
ld.shared.f32 %f332, [%rd109];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd22], %f333;
$L__BB0_124:
bar.sync 0;
- shr.u32 %r68, %r584, 1;
- setp.gt.u32 %p99, %r584, 3;
- mov.u32 %r584, %r68;
+ shr.u32 %r68, %r582, 1;
+ setp.gt.u32 %p99, %r582, 3;
+ mov.u32 %r582, %r68;
@%p99 bra $L__BB0_122;
$L__BB0_125:
- mov.u32 %r585, 0;
+ mov.u32 %r583, 0;
@%p58 bra $L__BB0_129;
- setp.lt.u32 %p101, %r3, 2;
+ setp.lt.u32 %p101, %r4, 2;
ld.shared.f32 %f334, [%rd22];
add.f32 %f463, %f334, 0f00000000;
@%p101 bra $L__BB0_128;
ld.shared.f32 %f335, [%rd24];
add.f32 %f463, %f463, %f335;
$L__BB0_128:
- mov.b32 %r585, %f463;
+ mov.b32 %r583, %f463;
$L__BB0_129:
- setp.eq.s32 %p179, %r7, 0;
- and.pred %p178, %p179, %p1;
- bar.sync 0;
- @%p178 bra $L__BB0_130;
+ setp.eq.s32 %p180, %r8, 0;
+ and.pred %p179, %p180, %p1;
+ bar.sync 0;
+ @%p179 bra $L__BB0_130;
bra.uni $L__BB0_131;
$L__BB0_130:
- shl.b32 %r564, %r5, 2;
- mov.u32 %r371, %ctaid.y;
- mad.lo.s32 %r372, %r155, %r371, %r564;
- mul.wide.s32 %rd112, %r372, 4;
+ mov.u32 %r370, %ctaid.y;
+ mad.lo.s32 %r371, %r155, %r370, %r7;
+ mul.wide.s32 %rd112, %r371, 4;
add.s64 %rd110, %rd41, %rd112;
- st.volatile.global.v4.s32 [%rd110], {%r571,%r573,%r575,%r577};
+ st.volatile.global.v4.s32 [%rd110], {%r569,%r571,%r573,%r575};
add.s64 %rd111, %rd42, %rd112;
- st.volatile.global.v4.s32 [%rd111], {%r579,%r581,%r583,%r585};
+ st.volatile.global.v4.s32 [%rd111], {%r577,%r579,%r581,%r583};
$L__BB0_131:
mov.u32 %r71, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r373, %r5, %r7;
- or.b32 %r375, %r373, %r336;
- setp.ne.s32 %p102, %r375, 0;
+ or.b32 %r372, %r6, %r8;
+ or.b32 %r374, %r372, %r335;
+ setp.ne.s32 %p102, %r374, 0;
@%p102 bra $L__BB0_135;
ld.param.u64 %rd136, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd113, %rd136;
- mov.u32 %r376, %ctaid.x;
- mov.u32 %r377, %ctaid.z;
- mov.u32 %r378, %nctaid.x;
- mad.lo.s32 %r379, %r377, %r378, %r376;
- mul.wide.s32 %rd114, %r379, 8;
+ mov.u32 %r375, %ctaid.x;
+ mov.u32 %r376, %ctaid.z;
+ mov.u32 %r377, %nctaid.x;
+ mad.lo.s32 %r378, %r376, %r377, %r375;
+ mul.wide.s32 %rd114, %r378, 8;
add.s64 %rd27, %rd113, %rd114;
- add.s32 %r380, %r8, -1;
- setp.eq.s32 %p103, %r71, %r380;
- cvt.s64.s32 %rd115, %r8;
+ add.s32 %r379, %r9, -1;
+ setp.eq.s32 %p103, %r71, %r379;
+ cvt.s64.s32 %rd115, %r9;
mov.u64 %rd116, -9223372036854775807;
sub.s64 %rd117, %rd116, %rd115;
selp.b64 %rd118, %rd117, 1, %p103;
atom.global.add.u64 %rd28, [%rd27], %rd118;
ld.volatile.global.u64 %rd119, [%rd27];
xor.b64 %rd120, %rd119, %rd28;
setp.lt.s64 %p104, %rd120, 0;
@%p104 bra $L__BB0_135;
- mov.u32 %r586, 8;
+ mov.u32 %r584, 8;
$L__BB0_134:
- nanosleep.u32 %r586;
-
- setp.lt.u32 %p105, %r586, 256;
- selp.u32 %r383, 1, 0, %p105;
- shl.b32 %r586, %r586, %r383;
+ nanosleep.u32 %r584;
+
+ setp.lt.u32 %p105, %r584, 256;
+ selp.u32 %r382, 1, 0, %p105;
+ shl.b32 %r584, %r584, %r382;
ld.volatile.global.u64 %rd121, [%rd27];
xor.b64 %rd122, %rd121, %rd28;
setp.gt.s64 %p106, %rd122, -1;
@%p106 bra $L__BB0_134;
$L__BB0_135:
bar.sync 0;
- add.s32 %r385, %r155, 1;
- shr.u32 %r386, %r385, 31;
- add.s32 %r387, %r385, %r386;
- shr.s32 %r388, %r387, 1;
- add.s32 %r389, %r3, %r388;
- add.s32 %r390, %r389, -1;
- div.s32 %r391, %r390, %r3;
- add.s32 %r392, %r8, -1;
- add.s32 %r393, %r392, %r391;
- div.s32 %r74, %r393, %r8;
- add.s32 %r75, %r392, %r2;
- shl.b32 %r76, %r7, 1;
- shl.b32 %r394, %r3, 1;
- mad.lo.s32 %r79, %r394, %r71, %r76;
+ add.s32 %r384, %r155, 1;
+ shr.u32 %r385, %r384, 31;
+ add.s32 %r386, %r384, %r385;
+ shr.s32 %r387, %r386, 1;
+ add.s32 %r388, %r4, %r387;
+ add.s32 %r389, %r388, -1;
+ div.s32 %r390, %r389, %r4;
+ add.s32 %r391, %r9, -1;
+ add.s32 %r392, %r391, %r390;
+ div.s32 %r74, %r392, %r9;
+ add.s32 %r75, %r391, %r3;
+ shl.b32 %r76, %r8, 1;
+ shl.b32 %r393, %r4, 1;
+ mad.lo.s32 %r79, %r393, %r71, %r76;
or.b32 %r77, %r79, 1;
- mul.lo.s32 %r78, %r394, %r8;
- shr.u32 %r80, %r2, 5;
- mul.lo.s32 %r395, %r35, %r80;
- shr.u32 %r81, %r5, 5;
- add.s32 %r396, %r395, %r81;
- mul.wide.u32 %rd123, %r396, 4;
+ mul.lo.s32 %r78, %r393, %r9;
+ shr.u32 %r80, %r3, 5;
+ mul.lo.s32 %r394, %r35, %r80;
+ shr.u32 %r81, %r6, 5;
+ add.s32 %r395, %r394, %r81;
+ mul.wide.u32 %rd123, %r395, 4;
add.s64 %rd29, %rd44, %rd123;
- and.b32 %r82, %r5, 31;
- add.s32 %r397, %r395, %r82;
- mul.wide.u32 %rd125, %r397, 4;
+ and.b32 %r82, %r6, 31;
+ add.s32 %r396, %r394, %r82;
+ mul.wide.u32 %rd125, %r396, 4;
add.s64 %rd30, %rd44, %rd125;
- mov.u32 %r587, 0;
+ mov.u32 %r585, 0;
bra.uni $L__BB0_136;
$L__BB0_183:
- add.s32 %r587, %r587, 1;
+ add.s32 %r585, %r585, 1;
$L__BB0_136:
.pragma "nounroll";
- setp.lt.s32 %p107, %r587, %r74;
+ setp.lt.s32 %p107, %r585, %r74;
@%p107 bra $L__BB0_162;
bra.uni $L__BB0_137;
$L__BB0_162:
- div.s32 %r105, %r75, %r2;
+ div.s32 %r105, %r75, %r3;
setp.lt.s32 %p144, %r105, 1;
mov.f32 %f474, 0f00000000;
mov.f32 %f475, %f474;
@%p144 bra $L__BB0_168;
- mul.lo.s32 %r482, %r78, %r587;
- add.s32 %r106, %r77, %r482;
- add.s32 %r107, %r79, %r482;
- mov.u32 %r481, 0;
+ mul.lo.s32 %r481, %r78, %r585;
+ add.s32 %r106, %r77, %r481;
+ add.s32 %r107, %r79, %r481;
+ mov.u32 %r480, 0;
mov.f32 %f474, 0f00000000;
- mov.u32 %r594, %r481;
+ mov.u32 %r592, %r480;
$L__BB0_164:
.pragma "nounroll";
setp.ge.s32 %p145, %r106, %r155;
- mov.u32 %r595, %r481;
- mov.u32 %r596, %r481;
+ mov.u32 %r593, %r480;
+ mov.u32 %r594, %r480;
@%p145 bra $L__BB0_167;
- mad.lo.s32 %r109, %r594, %r2, %r5;
- setp.ge.s32 %p146, %r109, %r8;
- mov.u32 %r595, %r481;
- mov.u32 %r596, %r481;
+ mad.lo.s32 %r109, %r592, %r3, %r6;
+ setp.ge.s32 %p146, %r109, %r9;
+ mov.u32 %r593, %r480;
+ mov.u32 %r594, %r480;
@%p146 bra $L__BB0_167;
- mad.lo.s32 %r489, %r109, %r155, %r107;
- mul.wide.s32 %rd131, %r489, 4;
+ mad.lo.s32 %r488, %r109, %r155, %r107;
+ mul.wide.s32 %rd131, %r488, 4;
add.s64 %rd130, %rd42, %rd131;
- ld.volatile.global.v2.s32 {%r596,%r595}, [%rd130];
+ ld.volatile.global.v2.s32 {%r594,%r593}, [%rd130];
$L__BB0_167:
- mov.b32 %f386, %r596;
+ mov.b32 %f386, %r594;
add.f32 %f475, %f475, %f386;
- mov.b32 %f387, %r595;
+ mov.b32 %f387, %r593;
add.f32 %f474, %f474, %f387;
- add.s32 %r594, %r594, 1;
- setp.lt.s32 %p147, %r594, %r105;
+ add.s32 %r592, %r592, 1;
+ setp.lt.s32 %p147, %r592, %r105;
@%p147 bra $L__BB0_164;
$L__BB0_168:
- mov.b32 %r490, %f475;
- mov.u32 %r491, 31;
- mov.u32 %r492, 16;
- mov.u32 %r493, -1;
- shfl.sync.bfly.b32 %r494|%p148, %r490, %r492, %r491, %r493;
- mov.b32 %f388, %r494;
+ mov.b32 %r489, %f475;
+ mov.u32 %r490, 31;
+ mov.u32 %r491, 16;
+ mov.u32 %r492, -1;
+ shfl.sync.bfly.b32 %r493|%p148, %r489, %r491, %r490, %r492;
+ mov.b32 %f388, %r493;
add.f32 %f389, %f475, %f388;
- mov.b32 %r495, %f389;
- mov.u32 %r496, 8;
- shfl.sync.bfly.b32 %r497|%p149, %r495, %r496, %r491, %r493;
- mov.b32 %f390, %r497;
+ mov.b32 %r494, %f389;
+ mov.u32 %r495, 8;
+ shfl.sync.bfly.b32 %r496|%p149, %r494, %r495, %r490, %r492;
+ mov.b32 %f390, %r496;
add.f32 %f391, %f389, %f390;
- mov.b32 %r498, %f391;
- mov.u32 %r499, 4;
- shfl.sync.bfly.b32 %r500|%p150, %r498, %r499, %r491, %r493;
- mov.b32 %f392, %r500;
+ mov.b32 %r497, %f391;
+ mov.u32 %r498, 4;
+ shfl.sync.bfly.b32 %r499|%p150, %r497, %r498, %r490, %r492;
+ mov.b32 %f392, %r499;
add.f32 %f393, %f391, %f392;
- mov.b32 %r501, %f393;
- mov.u32 %r502, 2;
- shfl.sync.bfly.b32 %r503|%p151, %r501, %r502, %r491, %r493;
- mov.b32 %f394, %r503;
+ mov.b32 %r500, %f393;
+ mov.u32 %r501, 2;
+ shfl.sync.bfly.b32 %r502|%p151, %r500, %r501, %r490, %r492;
+ mov.b32 %f394, %r502;
add.f32 %f395, %f393, %f394;
- mov.b32 %r504, %f395;
- mov.u32 %r505, 1;
- shfl.sync.bfly.b32 %r506|%p152, %r504, %r505, %r491, %r493;
- mov.b32 %f396, %r506;
+ mov.b32 %r503, %f395;
+ mov.u32 %r504, 1;
+ shfl.sync.bfly.b32 %r505|%p152, %r503, %r504, %r490, %r492;
+ mov.b32 %f396, %r505;
add.f32 %f477, %f395, %f396;
bar.sync 0;
setp.ne.s32 %p153, %r82, 0;
@%p153 bra $L__BB0_170;
@@ -1202,70 +1200,70 @@
@%p155 bra $L__BB0_173;
ld.shared.f32 %f476, [%rd30];
$L__BB0_173:
- mov.b32 %r507, %f476;
- mov.u32 %r508, 31;
- mov.u32 %r509, 16;
- mov.u32 %r510, -1;
- shfl.sync.bfly.b32 %r511|%p156, %r507, %r509, %r508, %r510;
- mov.b32 %f398, %r511;
+ mov.b32 %r506, %f476;
+ mov.u32 %r507, 31;
+ mov.u32 %r508, 16;
+ mov.u32 %r509, -1;
+ shfl.sync.bfly.b32 %r510|%p156, %r506, %r508, %r507, %r509;
+ mov.b32 %f398, %r510;
add.f32 %f399, %f476, %f398;
- mov.b32 %r512, %f399;
- mov.u32 %r513, 8;
- shfl.sync.bfly.b32 %r514|%p157, %r512, %r513, %r508, %r510;
- mov.b32 %f400, %r514;
+ mov.b32 %r511, %f399;
+ mov.u32 %r512, 8;
+ shfl.sync.bfly.b32 %r513|%p157, %r511, %r512, %r507, %r509;
+ mov.b32 %f400, %r513;
add.f32 %f401, %f399, %f400;
- mov.b32 %r515, %f401;
- mov.u32 %r516, 4;
- shfl.sync.bfly.b32 %r517|%p158, %r515, %r516, %r508, %r510;
- mov.b32 %f402, %r517;
+ mov.b32 %r514, %f401;
+ mov.u32 %r515, 4;
+ shfl.sync.bfly.b32 %r516|%p158, %r514, %r515, %r507, %r509;
+ mov.b32 %f402, %r516;
add.f32 %f403, %f401, %f402;
- mov.b32 %r518, %f403;
- mov.u32 %r519, 2;
- shfl.sync.bfly.b32 %r520|%p159, %r518, %r519, %r508, %r510;
- mov.b32 %f404, %r520;
+ mov.b32 %r517, %f403;
+ mov.u32 %r518, 2;
+ shfl.sync.bfly.b32 %r519|%p159, %r517, %r518, %r507, %r509;
+ mov.b32 %f404, %r519;
add.f32 %f405, %f403, %f404;
- mov.b32 %r521, %f405;
- mov.u32 %r522, 1;
- shfl.sync.bfly.b32 %r523|%p160, %r521, %r522, %r508, %r510;
- mov.b32 %f406, %r523;
+ mov.b32 %r520, %f405;
+ mov.u32 %r521, 1;
+ shfl.sync.bfly.b32 %r522|%p160, %r520, %r521, %r507, %r509;
+ mov.b32 %f406, %r522;
add.f32 %f477, %f405, %f406;
$L__BB0_174:
add.f32 %f407, %f477, 0f00000000;
- mov.b32 %r524, %f407;
+ mov.b32 %r523, %f407;
setp.eq.s32 %p162, %r82, 0;
- selp.b32 %r115, %r524, 0, %p162;
- bar.sync 0;
- mov.b32 %r525, %f474;
- mov.u32 %r526, 31;
- mov.u32 %r527, 16;
- mov.u32 %r528, -1;
- shfl.sync.bfly.b32 %r529|%p163, %r525, %r527, %r526, %r528;
- mov.b32 %f408, %r529;
+ selp.b32 %r115, %r523, 0, %p162;
+ bar.sync 0;
+ mov.b32 %r524, %f474;
+ mov.u32 %r525, 31;
+ mov.u32 %r526, 16;
+ mov.u32 %r527, -1;
+ shfl.sync.bfly.b32 %r528|%p163, %r524, %r526, %r525, %r527;
+ mov.b32 %f408, %r528;
add.f32 %f409, %f474, %f408;
- mov.b32 %r530, %f409;
- mov.u32 %r531, 8;
- shfl.sync.bfly.b32 %r532|%p164, %r530, %r531, %r526, %r528;
- mov.b32 %f410, %r532;
+ mov.b32 %r529, %f409;
+ mov.u32 %r530, 8;
+ shfl.sync.bfly.b32 %r531|%p164, %r529, %r530, %r525, %r527;
+ mov.b32 %f410, %r531;
add.f32 %f411, %f409, %f410;
- mov.b32 %r533, %f411;
- mov.u32 %r534, 4;
- shfl.sync.bfly.b32 %r535|%p165, %r533, %r534, %r526, %r528;
- mov.b32 %f412, %r535;
+ mov.b32 %r532, %f411;
+ mov.u32 %r533, 4;
+ shfl.sync.bfly.b32 %r534|%p165, %r532, %r533, %r525, %r527;
+ mov.b32 %f412, %r534;
add.f32 %f413, %f411, %f412;
- mov.b32 %r536, %f413;
- mov.u32 %r537, 2;
- shfl.sync.bfly.b32 %r538|%p166, %r536, %r537, %r526, %r528;
- mov.b32 %f414, %r538;
+ mov.b32 %r535, %f413;
+ mov.u32 %r536, 2;
+ shfl.sync.bfly.b32 %r537|%p166, %r535, %r536, %r525, %r527;
+ mov.b32 %f414, %r537;
add.f32 %f415, %f413, %f414;
- mov.b32 %r539, %f415;
- mov.u32 %r540, 1;
- shfl.sync.bfly.b32 %r541|%p167, %r539, %r540, %r526, %r528;
- mov.b32 %f416, %r541;
+ mov.b32 %r538, %f415;
+ mov.u32 %r539, 1;
+ shfl.sync.bfly.b32 %r540|%p167, %r538, %r539, %r525, %r527;
+ mov.b32 %f416, %r540;
add.f32 %f479, %f415, %f416;
bar.sync 0;
@%p153 bra $L__BB0_176;
st.shared.f32 [%rd29], %f479;
@@ -1279,142 +1277,142 @@
@%p169 bra $L__BB0_179;
ld.shared.f32 %f478, [%rd30];
$L__BB0_179:
- mov.b32 %r542, %f478;
- mov.u32 %r543, 31;
- mov.u32 %r544, 16;
- mov.u32 %r545, -1;
- shfl.sync.bfly.b32 %r546|%p170, %r542, %r544, %r543, %r545;
- mov.b32 %f418, %r546;
+ mov.b32 %r541, %f478;
+ mov.u32 %r542, 31;
+ mov.u32 %r543, 16;
+ mov.u32 %r544, -1;
+ shfl.sync.bfly.b32 %r545|%p170, %r541, %r543, %r542, %r544;
+ mov.b32 %f418, %r545;
add.f32 %f419, %f478, %f418;
- mov.b32 %r547, %f419;
- mov.u32 %r548, 8;
- shfl.sync.bfly.b32 %r549|%p171, %r547, %r548, %r543, %r545;
- mov.b32 %f420, %r549;
+ mov.b32 %r546, %f419;
+ mov.u32 %r547, 8;
+ shfl.sync.bfly.b32 %r548|%p171, %r546, %r547, %r542, %r544;
+ mov.b32 %f420, %r548;
add.f32 %f421, %f419, %f420;
- mov.b32 %r550, %f421;
- mov.u32 %r551, 4;
- shfl.sync.bfly.b32 %r552|%p172, %r550, %r551, %r543, %r545;
- mov.b32 %f422, %r552;
+ mov.b32 %r549, %f421;
+ mov.u32 %r550, 4;
+ shfl.sync.bfly.b32 %r551|%p172, %r549, %r550, %r542, %r544;
+ mov.b32 %f422, %r551;
add.f32 %f423, %f421, %f422;
- mov.b32 %r553, %f423;
- mov.u32 %r554, 2;
- shfl.sync.bfly.b32 %r555|%p173, %r553, %r554, %r543, %r545;
- mov.b32 %f424, %r555;
+ mov.b32 %r552, %f423;
+ mov.u32 %r553, 2;
+ shfl.sync.bfly.b32 %r554|%p173, %r552, %r553, %r542, %r544;
+ mov.b32 %f424, %r554;
add.f32 %f425, %f423, %f424;
- mov.b32 %r556, %f425;
- mov.u32 %r557, 1;
- shfl.sync.bfly.b32 %r558|%p174, %r556, %r557, %r543, %r545;
- mov.b32 %f426, %r558;
+ mov.b32 %r555, %f425;
+ mov.u32 %r556, 1;
+ shfl.sync.bfly.b32 %r557|%p174, %r555, %r556, %r542, %r544;
+ mov.b32 %f426, %r557;
add.f32 %f479, %f425, %f426;
$L__BB0_180:
bar.sync 0;
- setp.ne.s32 %p175, %r5, 0;
+ setp.ne.s32 %p175, %r6, 0;
@%p175 bra $L__BB0_183;
- mul.lo.s32 %r116, %r78, %r587;
- add.s32 %r559, %r77, %r116;
- setp.ge.s32 %p176, %r559, %r155;
+ mul.lo.s32 %r116, %r78, %r585;
+ add.s32 %r558, %r77, %r116;
+ setp.ge.s32 %p176, %r558, %r155;
@%p176 bra $L__BB0_183;
ld.param.u64 %rd135, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r562, %r79, %r116;
- mul.wide.s32 %rd133, %r562, 4;
+ add.s32 %r561, %r79, %r116;
+ mul.wide.s32 %rd133, %r561, 4;
add.s64 %rd132, %rd135, %rd133;
add.f32 %f427, %f479, 0f00000000;
- mov.b32 %r563, %f427;
- selp.b32 %r561, %r563, 0, %p162;
-
- st.global.cs.v2.s32 [%rd132], {%r115,%r561};
+ mov.b32 %r562, %f427;
+ selp.b32 %r560, %r562, 0, %p162;
+
+ st.global.cs.v2.s32 [%rd132], {%r115,%r560};
bra.uni $L__BB0_183;
$L__BB0_137:
setp.lt.s32 %p108, %r74, 1;
@%p108 bra $L__BB0_161;
- div.s32 %r84, %r75, %r2;
- mad.lo.s32 %r85, %r155, %r5, %r76;
+ div.s32 %r84, %r75, %r3;
+ mad.lo.s32 %r85, %r155, %r6, %r76;
shl.b32 %r86, %r71, 1;
- shl.b32 %r87, %r8, 1;
- mul.lo.s32 %r88, %r155, %r2;
- mov.u32 %r588, 0;
+ shl.b32 %r87, %r9, 1;
+ mul.lo.s32 %r88, %r155, %r3;
+ mov.u32 %r586, 0;
$L__BB0_139:
.pragma "nounroll";
setp.lt.s32 %p109, %r84, 1;
mov.f32 %f466, 0f00000000;
mov.f32 %f467, %f466;
@%p109 bra $L__BB0_145;
- mad.lo.s32 %r90, %r78, %r588, %r77;
- mad.lo.s32 %r400, %r87, %r588, %r86;
- mad.lo.s32 %r590, %r3, %r400, %r85;
- mov.u32 %r399, 0;
+ mad.lo.s32 %r90, %r78, %r586, %r77;
+ mad.lo.s32 %r399, %r87, %r586, %r86;
+ mad.lo.s32 %r588, %r4, %r399, %r85;
+ mov.u32 %r398, 0;
mov.f32 %f466, 0f00000000;
- mov.u32 %r589, %r5;
- mov.u32 %r591, %r399;
+ mov.u32 %r587, %r6;
+ mov.u32 %r589, %r398;
$L__BB0_141:
.pragma "nounroll";
setp.ge.s32 %p110, %r90, %r155;
- mov.u32 %r592, %r399;
- mov.u32 %r593, %r399;
+ mov.u32 %r590, %r398;
+ mov.u32 %r591, %r398;
@%p110 bra $L__BB0_144;
- setp.ge.s32 %p111, %r589, %r8;
- mov.u32 %r592, %r399;
- mov.u32 %r593, %r399;
+ setp.ge.s32 %p111, %r587, %r9;
+ mov.u32 %r590, %r398;
+ mov.u32 %r591, %r398;
@%p111 bra $L__BB0_144;
- mul.wide.s32 %rd127, %r590, 4;
+ mul.wide.s32 %rd127, %r588, 4;
add.s64 %rd126, %rd41, %rd127;
- ld.volatile.global.v2.s32 {%r593,%r592}, [%rd126];
+ ld.volatile.global.v2.s32 {%r591,%r590}, [%rd126];
$L__BB0_144:
- mov.b32 %f340, %r593;
+ mov.b32 %f340, %r591;
add.f32 %f467, %f467, %f340;
- mov.b32 %f341, %r592;
+ mov.b32 %f341, %r590;
add.f32 %f466, %f466, %f341;
- add.s32 %r590, %r590, %r88;
- add.s32 %r589, %r589, %r2;
- add.s32 %r591, %r591, 1;
- setp.lt.s32 %p112, %r591, %r84;
+ add.s32 %r588, %r588, %r88;
+ add.s32 %r587, %r587, %r3;
+ add.s32 %r589, %r589, 1;
+ setp.lt.s32 %p112, %r589, %r84;
@%p112 bra $L__BB0_141;
$L__BB0_145:
- mov.b32 %r407, %f467;
- mov.u32 %r408, 31;
- mov.u32 %r409, 16;
- mov.u32 %r410, -1;
- shfl.sync.bfly.b32 %r411|%p113, %r407, %r409, %r408, %r410;
- mov.b32 %f342, %r411;
+ mov.b32 %r406, %f467;
+ mov.u32 %r407, 31;
+ mov.u32 %r408, 16;
+ mov.u32 %r409, -1;
+ shfl.sync.bfly.b32 %r410|%p113, %r406, %r408, %r407, %r409;
+ mov.b32 %f342, %r410;
add.f32 %f343, %f467, %f342;
- mov.b32 %r412, %f343;
- mov.u32 %r413, 8;
- shfl.sync.bfly.b32 %r414|%p114, %r412, %r413, %r408, %r410;
- mov.b32 %f344, %r414;
+ mov.b32 %r411, %f343;
+ mov.u32 %r412, 8;
+ shfl.sync.bfly.b32 %r413|%p114, %r411, %r412, %r407, %r409;
+ mov.b32 %f344, %r413;
add.f32 %f345, %f343, %f344;
- mov.b32 %r415, %f345;
- mov.u32 %r416, 4;
- shfl.sync.bfly.b32 %r417|%p115, %r415, %r416, %r408, %r410;
- mov.b32 %f346, %r417;
+ mov.b32 %r414, %f345;
+ mov.u32 %r415, 4;
+ shfl.sync.bfly.b32 %r416|%p115, %r414, %r415, %r407, %r409;
+ mov.b32 %f346, %r416;
add.f32 %f347, %f345, %f346;
- mov.b32 %r418, %f347;
- mov.u32 %r419, 2;
- shfl.sync.bfly.b32 %r420|%p116, %r418, %r419, %r408, %r410;
- mov.b32 %f348, %r420;
+ mov.b32 %r417, %f347;
+ mov.u32 %r418, 2;
+ shfl.sync.bfly.b32 %r419|%p116, %r417, %r418, %r407, %r409;
+ mov.b32 %f348, %r419;
add.f32 %f349, %f347, %f348;
- mov.b32 %r421, %f349;
- mov.u32 %r422, 1;
- shfl.sync.bfly.b32 %r423|%p117, %r421, %r422, %r408, %r410;
- mov.b32 %f350, %r423;
+ mov.b32 %r420, %f349;
+ mov.u32 %r421, 1;
+ shfl.sync.bfly.b32 %r422|%p117, %r420, %r421, %r407, %r409;
+ mov.b32 %f350, %r422;
add.f32 %f469, %f349, %f350;
bar.sync 0;
setp.ne.s32 %p118, %r82, 0;
@%p118 bra $L__BB0_147;
@@ -1430,70 +1428,70 @@
@%p120 bra $L__BB0_150;
ld.shared.f32 %f468, [%rd30];
$L__BB0_150:
- mov.b32 %r424, %f468;
- mov.u32 %r425, 31;
- mov.u32 %r426, 16;
- mov.u32 %r427, -1;
- shfl.sync.bfly.b32 %r428|%p121, %r424, %r426, %r425, %r427;
- mov.b32 %f352, %r428;
+ mov.b32 %r423, %f468;
+ mov.u32 %r424, 31;
+ mov.u32 %r425, 16;
+ mov.u32 %r426, -1;
+ shfl.sync.bfly.b32 %r427|%p121, %r423, %r425, %r424, %r426;
+ mov.b32 %f352, %r427;
add.f32 %f353, %f468, %f352;
- mov.b32 %r429, %f353;
- mov.u32 %r430, 8;
- shfl.sync.bfly.b32 %r431|%p122, %r429, %r430, %r425, %r427;
- mov.b32 %f354, %r431;
+ mov.b32 %r428, %f353;
+ mov.u32 %r429, 8;
+ shfl.sync.bfly.b32 %r430|%p122, %r428, %r429, %r424, %r426;
+ mov.b32 %f354, %r430;
add.f32 %f355, %f353, %f354;
- mov.b32 %r432, %f355;
- mov.u32 %r433, 4;
- shfl.sync.bfly.b32 %r434|%p123, %r432, %r433, %r425, %r427;
- mov.b32 %f356, %r434;
+ mov.b32 %r431, %f355;
+ mov.u32 %r432, 4;
+ shfl.sync.bfly.b32 %r433|%p123, %r431, %r432, %r424, %r426;
+ mov.b32 %f356, %r433;
add.f32 %f357, %f355, %f356;
- mov.b32 %r435, %f357;
- mov.u32 %r436, 2;
- shfl.sync.bfly.b32 %r437|%p124, %r435, %r436, %r425, %r427;
- mov.b32 %f358, %r437;
+ mov.b32 %r434, %f357;
+ mov.u32 %r435, 2;
+ shfl.sync.bfly.b32 %r436|%p124, %r434, %r435, %r424, %r426;
+ mov.b32 %f358, %r436;
add.f32 %f359, %f357, %f358;
- mov.b32 %r438, %f359;
- mov.u32 %r439, 1;
- shfl.sync.bfly.b32 %r440|%p125, %r438, %r439, %r425, %r427;
- mov.b32 %f360, %r440;
+ mov.b32 %r437, %f359;
+ mov.u32 %r438, 1;
+ shfl.sync.bfly.b32 %r439|%p125, %r437, %r438, %r424, %r426;
+ mov.b32 %f360, %r439;
add.f32 %f469, %f359, %f360;
$L__BB0_151:
add.f32 %f361, %f469, 0f00000000;
- mov.b32 %r441, %f361;
+ mov.b32 %r440, %f361;
setp.eq.s32 %p127, %r82, 0;
- selp.b32 %r102, %r441, 0, %p127;
- bar.sync 0;
- mov.b32 %r442, %f466;
- mov.u32 %r443, 31;
- mov.u32 %r444, 16;
- mov.u32 %r445, -1;
- shfl.sync.bfly.b32 %r446|%p128, %r442, %r444, %r443, %r445;
- mov.b32 %f362, %r446;
+ selp.b32 %r102, %r440, 0, %p127;
+ bar.sync 0;
+ mov.b32 %r441, %f466;
+ mov.u32 %r442, 31;
+ mov.u32 %r443, 16;
+ mov.u32 %r444, -1;
+ shfl.sync.bfly.b32 %r445|%p128, %r441, %r443, %r442, %r444;
+ mov.b32 %f362, %r445;
add.f32 %f363, %f466, %f362;
- mov.b32 %r447, %f363;
- mov.u32 %r448, 8;
- shfl.sync.bfly.b32 %r449|%p129, %r447, %r448, %r443, %r445;
- mov.b32 %f364, %r449;
+ mov.b32 %r446, %f363;
+ mov.u32 %r447, 8;
+ shfl.sync.bfly.b32 %r448|%p129, %r446, %r447, %r442, %r444;
+ mov.b32 %f364, %r448;
add.f32 %f365, %f363, %f364;
- mov.b32 %r450, %f365;
- mov.u32 %r451, 4;
- shfl.sync.bfly.b32 %r452|%p130, %r450, %r451, %r443, %r445;
- mov.b32 %f366, %r452;
+ mov.b32 %r449, %f365;
+ mov.u32 %r450, 4;
+ shfl.sync.bfly.b32 %r451|%p130, %r449, %r450, %r442, %r444;
+ mov.b32 %f366, %r451;
add.f32 %f367, %f365, %f366;
- mov.b32 %r453, %f367;
- mov.u32 %r454, 2;
- shfl.sync.bfly.b32 %r455|%p131, %r453, %r454, %r443, %r445;
- mov.b32 %f368, %r455;
+ mov.b32 %r452, %f367;
+ mov.u32 %r453, 2;
+ shfl.sync.bfly.b32 %r454|%p131, %r452, %r453, %r442, %r444;
+ mov.b32 %f368, %r454;
add.f32 %f369, %f367, %f368;
- mov.b32 %r456, %f369;
- mov.u32 %r457, 1;
- shfl.sync.bfly.b32 %r458|%p132, %r456, %r457, %r443, %r445;
- mov.b32 %f370, %r458;
+ mov.b32 %r455, %f369;
+ mov.u32 %r456, 1;
+ shfl.sync.bfly.b32 %r457|%p132, %r455, %r456, %r442, %r444;
+ mov.b32 %f370, %r457;
add.f32 %f471, %f369, %f370;
bar.sync 0;
@%p118 bra $L__BB0_153;
st.shared.f32 [%rd29], %f471;
@@ -1507,62 +1505,62 @@
@%p134 bra $L__BB0_156;
ld.shared.f32 %f470, [%rd30];
$L__BB0_156:
- mov.b32 %r459, %f470;
- mov.u32 %r460, 31;
- mov.u32 %r461, 16;
- mov.u32 %r462, -1;
- shfl.sync.bfly.b32 %r463|%p135, %r459, %r461, %r460, %r462;
- mov.b32 %f372, %r463;
+ mov.b32 %r458, %f470;
+ mov.u32 %r459, 31;
+ mov.u32 %r460, 16;
+ mov.u32 %r461, -1;
+ shfl.sync.bfly.b32 %r462|%p135, %r458, %r460, %r459, %r461;
+ mov.b32 %f372, %r462;
add.f32 %f373, %f470, %f372;
- mov.b32 %r464, %f373;
- mov.u32 %r465, 8;
- shfl.sync.bfly.b32 %r466|%p136, %r464, %r465, %r460, %r462;
- mov.b32 %f374, %r466;
+ mov.b32 %r463, %f373;
+ mov.u32 %r464, 8;
+ shfl.sync.bfly.b32 %r465|%p136, %r463, %r464, %r459, %r461;
+ mov.b32 %f374, %r465;
add.f32 %f375, %f373, %f374;
- mov.b32 %r467, %f375;
- mov.u32 %r468, 4;
- shfl.sync.bfly.b32 %r469|%p137, %r467, %r468, %r460, %r462;
- mov.b32 %f376, %r469;
+ mov.b32 %r466, %f375;
+ mov.u32 %r467, 4;
+ shfl.sync.bfly.b32 %r468|%p137, %r466, %r467, %r459, %r461;
+ mov.b32 %f376, %r468;
add.f32 %f377, %f375, %f376;
- mov.b32 %r470, %f377;
- mov.u32 %r471, 2;
- shfl.sync.bfly.b32 %r472|%p138, %r470, %r471, %r460, %r462;
- mov.b32 %f378, %r472;
+ mov.b32 %r469, %f377;
+ mov.u32 %r470, 2;
+ shfl.sync.bfly.b32 %r471|%p138, %r469, %r470, %r459, %r461;
+ mov.b32 %f378, %r471;
add.f32 %f379, %f377, %f378;
- mov.b32 %r473, %f379;
- mov.u32 %r474, 1;
- shfl.sync.bfly.b32 %r475|%p139, %r473, %r474, %r460, %r462;
- mov.b32 %f380, %r475;
+ mov.b32 %r472, %f379;
+ mov.u32 %r473, 1;
+ shfl.sync.bfly.b32 %r474|%p139, %r472, %r473, %r459, %r461;
+ mov.b32 %f380, %r474;
add.f32 %f471, %f379, %f380;
$L__BB0_157:
bar.sync 0;
- setp.ne.s32 %p140, %r5, 0;
+ setp.ne.s32 %p140, %r6, 0;
@%p140 bra $L__BB0_160;
- mul.lo.s32 %r103, %r78, %r588;
- add.s32 %r476, %r77, %r103;
- setp.ge.s32 %p141, %r476, %r155;
+ mul.lo.s32 %r103, %r78, %r586;
+ add.s32 %r475, %r77, %r103;
+ setp.ge.s32 %p141, %r475, %r155;
@%p141 bra $L__BB0_160;
ld.param.u64 %rd134, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r479, %r79, %r103;
- mul.wide.s32 %rd129, %r479, 4;
+ add.s32 %r478, %r79, %r103;
+ mul.wide.s32 %rd129, %r478, 4;
add.s64 %rd128, %rd134, %rd129;
add.f32 %f381, %f471, 0f00000000;
- mov.b32 %r480, %f381;
- selp.b32 %r478, %r480, 0, %p127;
-
- st.global.cs.v2.s32 [%rd128], {%r102,%r478};
+ mov.b32 %r479, %f381;
+ selp.b32 %r477, %r479, 0, %p127;
+
+ st.global.cs.v2.s32 [%rd128], {%r102,%r477};
$L__BB0_160:
- add.s32 %r588, %r588, 1;
- setp.lt.s32 %p143, %r588, %r74;
+ add.s32 %r586, %r586, 1;
+ setp.lt.s32 %p143, %r586, %r74;
@%p143 bra $L__BB0_139;
$L__BB0_161:
ret;
12: CombinedSchedulerTest.LayerNormBackward/dtype_float_batch_216_hidden_65536
Kernel 3
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-9
+9 index type: int
registers: 40
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T11, Tensor<float, 2, 2> T10, Tensor<float, 2, 2> T7, Tensor<float, 1, 1> T14, Tensor<float, 2, 2> T20) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T28 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T25 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T24 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T7.data;
s0.logical_size = T7.logical_size;
s0.alloc_stride = T7.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T27.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(4 * i6)], &T7[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T26.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(4 * i7)], &T10[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T24[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
T12[0]
= T25[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T28[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= (float) d5
* T24[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T29[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T27.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(4 * i6)], &T7[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T26.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(4 * i7)], &T10[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T24[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T12[0]
= T25[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T15[0]
= T28[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= (float) d5
* T24[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T29[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T11, Tensor<float, 2, 2> T10, Tensor<float, 2, 2> T7, Tensor<float, 1, 1> T14, Tensor<float, 2, 2> T20) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T28 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T25 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T24 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T7.data;
s0.logical_size = T7.logical_size;
s0.alloc_stride = T7.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T27.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(4 * i6)], &T7[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T26.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(4 * i7)], &T10[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
T12[0]
= T25[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
= T28[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= (float) d5
* T24[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T29[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T27.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T27[(4 * i6)], &T7[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T25[((nvfuser_index_t)threadIdx.x)]
= T11[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T26;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T26.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(4 * i7)], &T10[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T12[0]
= T25[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T15[0]
= T28[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= (float) d5
* T24[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T29[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -17,11 +17,11 @@
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
- if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
+ if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T28[((nvfuser_index_t)threadIdx.x)]
= T14[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T27;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
@@ -46,11 +46,11 @@
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T26[(4 * i7)], &T10[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
- T24[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
auto& T29 = T26;
@@ -60,19 +60,19 @@
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
T12[0]
- = T25[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T25[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
T15[0]
- = T28[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T28[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
Array<float, 1, 1> T18;
@@ -81,11 +81,11 @@
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= (float) d5
- * T24[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T24[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
}
@@ -131,11 +131,11 @@
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
- T24[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T24[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
// Alias Allocation - register
@@ -147,21 +147,21 @@
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T12;
T12[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T12[0]
- = T25[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T25[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T17;
T17[0]
= T26[((4 * i9) + i10)]
- T12[0];
Array<float, 1, 1> T15;
T15[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T15[0]
- = T28[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T28[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T16;
T16[0]
= T27[((4 * i9) + i10)]
* T15[0];
@@ -172,11 +172,11 @@
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= (float) d5
- * T24[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T24[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
T29[((4 * i9) + i10)]
= T19[0]
* T18[0];
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<48>;
.reg .f32 %f<257>;
.reg .b32 %r<251>;
.reg .f64 %fd<3>;
.reg .b64 %rd<61>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0+16];
ld.param.v2.u32 {%r64, %r65}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3+8];
ld.param.u64 %rd13, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5];
ld.param.u64 %rd12, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4];
ld.param.u64 %rd11, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_2];
ld.param.u64 %rd14, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1];
cvta.to.global.u64 %rd1, %rd11;
cvta.to.global.u64 %rd2, %rd14;
ld.param.u64 %rd4, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd15, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd15], %r2;
ld.shared.u32 %r4, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_1033910nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r65;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd16, %r2, 4;
mov.u64 %rd17, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_2ba9a3a5_103395arrayE;
add.s64 %rd6, %rd17, %rd16;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r65, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd18, %rd12;
mul.wide.s32 %rd19, %r8, 4;
add.s64 %rd8, %rd18, %rd19;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r82, %r9, %r7;
add.s32 %r83, %r82, 16;
setp.gt.s32 %p4, %r83, 215;
@%p4 bra $L__BB0_7;
and.b32 %r87, %r81, 1073741816;
sub.s32 %r88, %r2, %r87;
shl.b32 %r10, %r88, 2;
rem.s32 %r89, %r5, %r6;
shl.b32 %r11, %r89, 5;
add.s32 %r90, %r10, %r11;
or.b32 %r91, %r90, 3;
setp.ge.s32 %p5, %r91, %r65;
@%p5 bra $L__BB0_7;
shr.u32 %r93, %r79, 27;
add.s32 %r94, %r2, %r93;
and.b32 %r95, %r94, -32;
sub.s32 %r12, %r2, %r95;
add.s32 %r13, %r7, %r12;
setp.lt.s32 %p6, %r13, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f157, [%rd8];
st.shared.f32 [%rd6+4608], %f157;
shl.b32 %r220, %r4, 5;
add.s32 %r221, %r7, %r9;
add.s32 %r222, %r221, %r220;
mad.lo.s32 %r223, %r222, %r65, %r10;
add.s32 %r224, %r223, %r11;
mul.wide.s32 %rd46, %r224, 4;
add.s64 %rd40, %rd4, %rd46;
// begin inline asm
ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd40];
// end inline asm
add.s32 %r225, %r222, 16;
mad.lo.s32 %r226, %r225, %r65, %r10;
add.s32 %r227, %r226, %r11;
mul.wide.s32 %rd47, %r227, 4;
add.s64 %rd41, %rd4, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd41];
// end inline asm
shl.b64 %rd48, %rd7, 2;
add.s64 %rd49, %rd2, %rd48;
ld.global.f32 %f158, [%rd49];
st.shared.f32 [%rd6+4096], %f158;
mul.lo.s32 %r228, %r4, 96;
add.s32 %r229, %r222, %r228;
mad.lo.s32 %r230, %r229, %r65, %r10;
add.s32 %r231, %r230, %r11;
mul.wide.s32 %rd50, %r231, 4;
add.s64 %rd42, %rd3, %rd50;
// begin inline asm
ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd42];
// end inline asm
add.s32 %r232, %r225, %r228;
mad.lo.s32 %r233, %r232, %r65, %r10;
add.s32 %r234, %r233, %r11;
mul.wide.s32 %rd51, %r234, 4;
add.s64 %rd43, %rd3, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r208,%r209,%r210,%r211}, [%rd43];
// end inline asm
mul.lo.s32 %r235, %r13, %r62;
mul.wide.s32 %rd52, %r235, 4;
add.s64 %rd53, %rd1, %rd52;
mul.wide.s32 %rd54, %r12, 4;
add.s64 %rd56, %rd17, %rd54;
ld.global.f32 %f159, [%rd53];
st.shared.f32 [%rd56], %f159;
barrier.sync 0;
mul.wide.s32 %rd57, %r9, 4;
add.s64 %rd58, %rd17, %rd57;
ld.shared.f32 %f160, [%rd58];
cvt.rn.f32.f64 %f161, %fd1;
mul.f32 %f162, %f160, %f161;
mov.b32 %f163, %r204;
ld.shared.f32 %f164, [%rd58+4096];
sub.f32 %f165, %f163, %f164;
mov.b32 %f166, %r196;
ld.shared.f32 %f167, [%rd58+4608];
mul.f32 %f168, %f167, %f166;
sub.f32 %f169, %f165, %f168;
mul.f32 %f170, %f162, %f169;
mov.b32 %r212, %f170;
mov.b32 %f171, %r205;
sub.f32 %f172, %f171, %f164;
mov.b32 %f173, %r197;
mul.f32 %f174, %f167, %f173;
sub.f32 %f175, %f172, %f174;
mul.f32 %f176, %f162, %f175;
mov.b32 %r213, %f176;
mov.b32 %f177, %r206;
sub.f32 %f178, %f177, %f164;
mov.b32 %f179, %r198;
mul.f32 %f180, %f167, %f179;
sub.f32 %f181, %f178, %f180;
mul.f32 %f182, %f162, %f181;
mov.b32 %r214, %f182;
mov.b32 %f183, %r207;
sub.f32 %f184, %f183, %f164;
mov.b32 %f185, %r199;
mul.f32 %f186, %f167, %f185;
sub.f32 %f187, %f184, %f186;
mul.f32 %f188, %f162, %f187;
mov.b32 %r215, %f188;
ld.shared.f32 %f189, [%rd58+64];
mul.f32 %f190, %f189, %f161;
mov.b32 %f191, %r208;
ld.shared.f32 %f192, [%rd58+4160];
sub.f32 %f193, %f191, %f192;
mov.b32 %f194, %r200;
ld.shared.f32 %f195, [%rd58+4672];
mul.f32 %f196, %f195, %f194;
sub.f32 %f197, %f193, %f196;
mul.f32 %f198, %f190, %f197;
mov.b32 %r216, %f198;
mov.b32 %f199, %r209;
sub.f32 %f200, %f199, %f192;
mov.b32 %f201, %r201;
mul.f32 %f202, %f195, %f201;
sub.f32 %f203, %f200, %f202;
mul.f32 %f204, %f190, %f203;
mov.b32 %r217, %f204;
mov.b32 %f205, %r210;
sub.f32 %f206, %f205, %f192;
mov.b32 %f207, %r202;
mul.f32 %f208, %f195, %f207;
sub.f32 %f209, %f206, %f208;
mul.f32 %f210, %f190, %f209;
mov.b32 %r218, %f210;
mov.b32 %f211, %r211;
sub.f32 %f212, %f211, %f192;
mov.b32 %f213, %r203;
mul.f32 %f214, %f195, %f213;
sub.f32 %f215, %f212, %f214;
mul.f32 %f216, %f190, %f215;
mov.b32 %r219, %f216;
mul.lo.s32 %r236, %r4, 896;
add.s32 %r237, %r229, %r236;
mad.lo.s32 %r238, %r237, %r65, %r10;
add.s32 %r239, %r238, %r11;
mul.wide.s32 %rd59, %r239, 4;
add.s64 %rd44, %rd13, %rd59;
// begin inline asm
st.global.cs.v4.s32 [%rd44], {%r212,%r213,%r214,%r215};
// end inline asm
add.s32 %r240, %r232, %r236;
mad.lo.s32 %r241, %r240, %r65, %r10;
add.s32 %r242, %r241, %r11;
mul.wide.s32 %rd60, %r242, 4;
add.s64 %rd45, %rd13, %rd60;
// begin inline asm
st.global.cs.v4.s32 [%rd45], {%r216,%r217,%r218,%r219};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f90, [%rd8];
st.shared.f32 [%rd6+4608], %f90;
$L__BB0_9:
mov.u32 %r14, %ctaid.x;
add.s32 %r100, %r65, 31;
shr.s32 %r101, %r100, 31;
shr.u32 %r102, %r101, 27;
add.s32 %r103, %r100, %r102;
shr.s32 %r15, %r103, 5;
shl.b32 %r16, %r4, 5;
shr.s32 %r104, %r2, 31;
shr.u32 %r105, %r104, 29;
add.s32 %r106, %r2, %r105;
and.b32 %r107, %r106, 1073741816;
sub.s32 %r108, %r2, %r107;
shl.b32 %r109, %r108, 2;
rem.s32 %r110, %r14, %r15;
shl.b32 %r111, %r110, 5;
add.s32 %r20, %r111, %r109;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r65;
shr.s32 %r18, %r106, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r243, 0;
mov.u32 %r244, %r243;
mov.u32 %r245, %r243;
mov.u32 %r246, %r243;
@%p8 bra $L__BB0_12;
div.s32 %r116, %r14, %r15;
shl.b32 %r21, %r116, 5;
add.s32 %r117, %r19, %r21;
neg.s32 %r118, %r16;
setp.ge.s32 %p9, %r117, %r118;
@%p9 bra $L__BB0_12;
add.s32 %r123, %r16, %r18;
add.s32 %r124, %r123, %r21;
mad.lo.s32 %r125, %r124, %r65, %r20;
mul.wide.s32 %rd21, %r125, 4;
add.s64 %rd20, %rd4, %rd21;
// begin inline asm
ld.global.cs.v4.u32 {%r246,%r245,%r244,%r243}, [%rd20];
// end inline asm
$L__BB0_12:
mov.f32 %f225, 0f00000000;
mov.f32 %f226, 0f00000000;
mov.f32 %f227, 0f00000000;
mov.f32 %f228, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r126, %r14, %r15;
shl.b32 %r30, %r126, 5;
add.s32 %r127, %r19, %r30;
mov.u32 %r128, -16;
sub.s32 %r129, %r128, %r16;
setp.ge.s32 %p11, %r127, %r129;
@%p11 bra $L__BB0_15;
add.s32 %r134, %r16, %r18;
add.s32 %r135, %r134, %r30;
add.s32 %r136, %r135, 16;
mad.lo.s32 %r137, %r136, %r65, %r20;
mul.wide.s32 %rd23, %r137, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
// end inline asm
mov.b32 %f228, %r130;
mov.b32 %f227, %r131;
mov.b32 %f226, %r132;
mov.b32 %f225, %r133;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r138, %r14, %r15;
shl.b32 %r139, %r138, 5;
add.s32 %r32, %r139, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd24, %r32, 4;
add.s64 %rd25, %rd2, %rd24;
ld.global.f32 %f99, [%rd25];
st.shared.f32 [%rd6+4096], %f99;
$L__BB0_18:
mov.u32 %r247, 0;
mov.u32 %r248, %r247;
mov.u32 %r249, %r247;
mov.u32 %r250, %r247;
@%p8 bra $L__BB0_21;
div.s32 %r148, %r14, %r15;
shl.b32 %r33, %r148, 5;
add.s32 %r149, %r19, %r33;
neg.s32 %r150, %r31;
setp.ge.s32 %p15, %r149, %r150;
@%p15 bra $L__BB0_21;
add.s32 %r155, %r31, %r18;
add.s32 %r156, %r155, %r33;
mad.lo.s32 %r157, %r156, %r65, %r20;
mul.wide.s32 %rd27, %r157, 4;
add.s64 %rd26, %rd3, %rd27;
// begin inline asm
ld.global.cs.v4.u32 {%r250,%r249,%r248,%r247}, [%rd26];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r65;
mov.f32 %f229, 0f00000000;
mov.f32 %f230, 0f00000000;
mov.f32 %f231, 0f00000000;
mov.f32 %f232, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r158, %r14, %r15;
shl.b32 %r42, %r158, 5;
add.s32 %r159, %r19, %r42;
mov.u32 %r160, -16;
sub.s32 %r161, %r160, %r31;
setp.ge.s32 %p17, %r159, %r161;
@%p17 bra $L__BB0_24;
add.s32 %r166, %r31, %r18;
add.s32 %r167, %r166, %r42;
add.s32 %r168, %r167, 16;
mad.lo.s32 %r169, %r168, %r65, %r20;
mul.wide.s32 %rd29, %r169, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
// end inline asm
mov.b32 %f232, %r162;
mov.b32 %f231, %r163;
mov.b32 %f230, %r164;
mov.b32 %f229, %r165;
$L__BB0_24:
div.s32 %r170, %r14, %r15;
shl.b32 %r43, %r170, 5;
shr.u32 %r172, %r104, 27;
add.s32 %r173, %r2, %r172;
and.b32 %r174, %r173, -32;
sub.s32 %r44, %r2, %r174;
add.s32 %r175, %r43, %r44;
setp.gt.s32 %p18, %r175, 215;
mul.lo.s32 %r176, %r175, %r62;
mul.wide.s32 %rd30, %r176, 4;
add.s64 %rd9, %rd1, %rd30;
@%p18 bra $L__BB0_26;
mul.wide.s32 %rd31, %r44, 4;
add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f108, [%rd9];
st.shared.f32 [%rd33], %f108;
$L__BB0_26:
shl.b32 %r45, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r46, %r45;
add.s32 %r47, %r19, %r43;
setp.ge.s32 %p19, %r47, %r46;
mul.wide.s32 %rd34, %r18, 4;
add.s64 %rd10, %rd17, %rd34;
mov.f32 %f234, 0f00000000;
mov.f32 %f233, %f234;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f233, [%rd10+4096];
$L__BB0_28:
mov.b32 %f111, %r250;
sub.f32 %f20, %f111, %f233;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f234, [%rd10+4608];
$L__BB0_30:
mov.b32 %f113, %r246;
mul.f32 %f114, %f234, %f113;
sub.f32 %f23, %f20, %f114;
mov.f32 %f236, 0f00000000;
mov.f32 %f235, %f236;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f115, [%rd10];
mul.f32 %f235, %f115, %f17;
$L__BB0_32:
mul.f32 %f26, %f23, %f235;
@%p19 bra $L__BB0_34;
ld.shared.f32 %f236, [%rd10+4096];
$L__BB0_34:
mov.b32 %f118, %r249;
sub.f32 %f29, %f118, %f236;
mov.f32 %f238, 0f00000000;
mov.f32 %f237, %f238;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f237, [%rd10+4608];
$L__BB0_36:
mov.b32 %f120, %r245;
mul.f32 %f121, %f237, %f120;
sub.f32 %f32, %f29, %f121;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f122, [%rd10];
mul.f32 %f238, %f122, %f17;
$L__BB0_38:
mul.f32 %f35, %f32, %f238;
mov.f32 %f240, 0f00000000;
mov.f32 %f239, %f240;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f239, [%rd10+4096];
$L__BB0_40:
mov.b32 %f125, %r248;
sub.f32 %f38, %f125, %f239;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f240, [%rd10+4608];
$L__BB0_42:
mov.b32 %f127, %r244;
mul.f32 %f128, %f240, %f127;
sub.f32 %f41, %f38, %f128;
mov.f32 %f242, 0f00000000;
mov.f32 %f241, %f242;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f129, [%rd10];
mul.f32 %f241, %f129, %f17;
$L__BB0_44:
mul.f32 %f44, %f41, %f241;
@%p19 bra $L__BB0_46;
ld.shared.f32 %f242, [%rd10+4096];
$L__BB0_46:
mov.b32 %f132, %r247;
sub.f32 %f47, %f132, %f242;
mov.f32 %f244, 0f00000000;
mov.f32 %f243, %f244;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f243, [%rd10+4608];
$L__BB0_48:
mov.b32 %f134, %r243;
mul.f32 %f135, %f243, %f134;
sub.f32 %f50, %f47, %f135;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f136, [%rd10];
mul.f32 %f244, %f136, %f17;
$L__BB0_50:
mul.f32 %f53, %f50, %f244;
mov.u32 %r177, -16;
sub.s32 %r48, %r177, %r45;
setp.ge.s32 %p31, %r47, %r48;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f245, [%rd10+4160];
$L__BB0_52:
sub.f32 %f56, %f232, %f245;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f246, [%rd10+4672];
$L__BB0_54:
mul.f32 %f140, %f246, %f228;
sub.f32 %f59, %f56, %f140;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f141, [%rd10+64];
mul.f32 %f247, %f141, %f17;
$L__BB0_56:
mul.f32 %f62, %f59, %f247;
@%p31 bra $L__BB0_58;
ld.shared.f32 %f248, [%rd10+4160];
$L__BB0_58:
sub.f32 %f65, %f231, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f249, [%rd10+4672];
$L__BB0_60:
mul.f32 %f145, %f249, %f227;
sub.f32 %f68, %f65, %f145;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f146, [%rd10+64];
mul.f32 %f250, %f146, %f17;
$L__BB0_62:
mul.f32 %f71, %f68, %f250;
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f251, [%rd10+4160];
$L__BB0_64:
sub.f32 %f74, %f230, %f251;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f252, [%rd10+4672];
$L__BB0_66:
mul.f32 %f150, %f252, %f226;
sub.f32 %f77, %f74, %f150;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f151, [%rd10+64];
mul.f32 %f253, %f151, %f17;
$L__BB0_68:
mul.f32 %f80, %f77, %f253;
@%p31 bra $L__BB0_70;
ld.shared.f32 %f254, [%rd10+4160];
$L__BB0_70:
sub.f32 %f83, %f229, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f255, [%rd10+4672];
$L__BB0_72:
mul.f32 %f155, %f255, %f225;
sub.f32 %f86, %f83, %f155;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f156, [%rd10+64];
mul.f32 %f256, %f156, %f17;
$L__BB0_74:
mul.f32 %f89, %f86, %f256;
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r178, %r49;
setp.ge.s32 %p44, %r47, %r178;
@%p44 bra $L__BB0_77;
add.s32 %r183, %r49, %r18;
add.s32 %r184, %r183, %r43;
mad.lo.s32 %r185, %r184, %r65, %r20;
mul.wide.s32 %rd37, %r185, 4;
add.s64 %rd36, %rd13, %rd37;
mov.b32 %r182, %f53;
mov.b32 %r180, %f35;
mov.b32 %r179, %f26;
mov.b32 %r181, %f44;
// begin inline asm
st.global.cs.v4.s32 [%rd36], {%r179,%r180,%r181,%r182};
// end inline asm
$L__BB0_77:
mov.u32 %r186, -16;
sub.s32 %r187, %r186, %r49;
setp.ge.s32 %p46, %r47, %r187;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r192, %r49, %r18;
add.s32 %r193, %r192, %r43;
add.s32 %r194, %r193, 16;
mad.lo.s32 %r195, %r194, %r65, %r20;
mul.wide.s32 %rd39, %r195, 4;
add.s64 %rd38, %rd13, %rd39;
mov.b32 %r191, %f89;
mov.b32 %r189, %f71;
mov.b32 %r188, %f62;
mov.b32 %r190, %f80;
// begin inline asm
st.global.cs.v4.s32 [%rd38], {%r188,%r189,%r190,%r191};
// end inline asm
$L__BB0_80:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<48>;
.reg .f32 %f<269>;
.reg .b32 %r<274>;
.reg .f64 %fd<3>;
.reg .b64 %rd<69>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0+16];
ld.param.v2.u32 {%r64, %r65}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3+8];
ld.param.u64 %rd15, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5];
ld.param.u64 %rd14, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4];
ld.param.u64 %rd13, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_2];
ld.param.u64 %rd16, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1];
cvta.to.global.u64 %rd1, %rd13;
cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd17, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_723310nvfuser_26ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r65;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd18, %r2, 4;
mov.u64 %rd19, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_26_cu_16ee897e_72335arrayE;
add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r65, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd20, %rd14;
mul.wide.s32 %rd21, %r8, 4;
add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r10, %r9, 16;
add.s32 %r82, %r10, %r7;
setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
rem.s32 %r83, %r5, %r6;
shl.b32 %r11, %r83, 5;
or.b32 %r84, %r11, 31;
setp.ge.s32 %p5, %r84, %r65;
@%p5 bra $L__BB0_7;
shr.u32 %r86, %r79, 27;
add.s32 %r87, %r2, %r86;
and.b32 %r88, %r87, -32;
sub.s32 %r89, %r2, %r88;
add.s32 %r12, %r7, %r89;
setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f157, [%rd8];
st.shared.f32 [%rd6+4608], %f157;
and.b32 %r228, %r81, -8;
sub.s32 %r229, %r2, %r228;
shl.b32 %r230, %r229, 2;
shl.b32 %r231, %r4, 5;
add.s32 %r232, %r7, %r9;
add.s32 %r233, %r232, %r231;
mad.lo.s32 %r234, %r233, %r65, %r230;
add.s32 %r235, %r234, %r11;
mul.wide.s32 %rd51, %r235, 4;
add.s64 %rd45, %rd4, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd45];
// end inline asm
add.s32 %r236, %r233, 16;
mad.lo.s32 %r237, %r236, %r65, %r230;
add.s32 %r238, %r237, %r11;
mul.wide.s32 %rd52, %r238, 4;
add.s64 %rd46, %rd4, %rd52;
// begin inline asm
ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd46];
// end inline asm
shl.b64 %rd53, %rd7, 2;
add.s64 %rd54, %rd2, %rd53;
ld.global.f32 %f158, [%rd54];
st.shared.f32 [%rd6+4096], %f158;
mul.lo.s32 %r239, %r4, 96;
add.s32 %r240, %r233, %r239;
mad.lo.s32 %r241, %r240, %r65, %r230;
add.s32 %r242, %r241, %r11;
mul.wide.s32 %rd55, %r242, 4;
add.s64 %rd47, %rd3, %rd55;
// begin inline asm
ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd47];
// end inline asm
add.s32 %r243, %r236, %r239;
mad.lo.s32 %r244, %r243, %r65, %r230;
add.s32 %r245, %r244, %r11;
mul.wide.s32 %rd56, %r245, 4;
add.s64 %rd48, %rd3, %rd56;
// begin inline asm
ld.global.cs.v4.u32 {%r213,%r214,%r215,%r216}, [%rd48];
// end inline asm
mul.lo.s32 %r246, %r12, %r62;
mul.wide.s32 %rd57, %r246, 4;
add.s64 %rd58, %rd1, %rd57;
ld.global.f32 %f159, [%rd58];
st.shared.f32 [%rd6], %f159;
st.shared.f32 [%rd6+512], %f159;
st.shared.f32 [%rd6+1024], %f159;
st.shared.f32 [%rd6+1536], %f159;
st.shared.f32 [%rd6+2048], %f159;
st.shared.f32 [%rd6+2560], %f159;
st.shared.f32 [%rd6+3072], %f159;
st.shared.f32 [%rd6+3584], %f159;
barrier.sync 0;
shl.b32 %r247, %r229, 7;
add.s32 %r248, %r247, %r9;
shr.s32 %r249, %r9, 31;
shr.u32 %r250, %r249, 25;
add.s32 %r251, %r9, %r250;
and.b32 %r252, %r251, -128;
sub.s32 %r253, %r9, %r252;
mul.wide.s32 %rd59, %r253, 4;
add.s64 %rd61, %rd19, 4096;
add.s64 %rd62, %rd61, %rd59;
mov.b32 %f160, %r209;
ld.shared.f32 %f161, [%rd62];
sub.f32 %f162, %f160, %f161;
mov.b32 %f163, %r201;
ld.shared.f32 %f164, [%rd62+512];
mul.f32 %f165, %f164, %f163;
sub.f32 %f166, %f162, %f165;
mul.wide.s32 %rd63, %r248, 4;
add.s64 %rd64, %rd19, %rd63;
ld.shared.f32 %f167, [%rd64];
cvt.rn.f32.f64 %f168, %fd1;
mul.f32 %f169, %f167, %f168;
mul.f32 %f170, %f169, %f166;
mov.b32 %r217, %f170;
mov.b32 %f171, %r210;
sub.f32 %f172, %f171, %f161;
mov.b32 %f173, %r202;
mul.f32 %f174, %f164, %f173;
sub.f32 %f175, %f172, %f174;
ld.shared.f32 %f176, [%rd64+128];
mul.f32 %f177, %f176, %f168;
mul.f32 %f178, %f177, %f175;
mov.b32 %r218, %f178;
mov.b32 %f179, %r211;
sub.f32 %f180, %f179, %f161;
mov.b32 %f181, %r203;
mul.f32 %f182, %f164, %f181;
sub.f32 %f183, %f180, %f182;
ld.shared.f32 %f184, [%rd64+256];
mul.f32 %f185, %f184, %f168;
mul.f32 %f186, %f185, %f183;
mov.b32 %r219, %f186;
mov.b32 %f187, %r212;
sub.f32 %f188, %f187, %f161;
mov.b32 %f189, %r204;
mul.f32 %f190, %f164, %f189;
sub.f32 %f191, %f188, %f190;
ld.shared.f32 %f192, [%rd64+384];
mul.f32 %f193, %f192, %f168;
mul.f32 %f194, %f193, %f191;
mov.b32 %r220, %f194;
shr.s32 %r254, %r10, 31;
shr.u32 %r255, %r254, 25;
add.s32 %r256, %r10, %r255;
and.b32 %r257, %r256, -128;
sub.s32 %r258, %r10, %r257;
mul.wide.s32 %rd65, %r258, 4;
add.s64 %rd66, %rd61, %rd65;
mov.b32 %f195, %r213;
ld.shared.f32 %f196, [%rd66];
sub.f32 %f197, %f195, %f196;
mov.b32 %f198, %r205;
ld.shared.f32 %f199, [%rd66+512];
mul.f32 %f200, %f199, %f198;
sub.f32 %f201, %f197, %f200;
ld.shared.f32 %f202, [%rd64+64];
mul.f32 %f203, %f202, %f168;
mul.f32 %f204, %f203, %f201;
mov.b32 %r221, %f204;
mov.b32 %f205, %r214;
sub.f32 %f206, %f205, %f196;
mov.b32 %f207, %r206;
mul.f32 %f208, %f199, %f207;
sub.f32 %f209, %f206, %f208;
ld.shared.f32 %f210, [%rd64+192];
mul.f32 %f211, %f210, %f168;
mul.f32 %f212, %f211, %f209;
mov.b32 %r222, %f212;
mov.b32 %f213, %r215;
sub.f32 %f214, %f213, %f196;
mov.b32 %f215, %r207;
mul.f32 %f216, %f199, %f215;
sub.f32 %f217, %f214, %f216;
ld.shared.f32 %f218, [%rd64+320];
mul.f32 %f219, %f218, %f168;
mul.f32 %f220, %f219, %f217;
mov.b32 %r223, %f220;
mov.b32 %f221, %r216;
sub.f32 %f222, %f221, %f196;
mov.b32 %f223, %r208;
mul.f32 %f224, %f199, %f223;
sub.f32 %f225, %f222, %f224;
ld.shared.f32 %f226, [%rd64+448];
mul.f32 %f227, %f226, %f168;
mul.f32 %f228, %f227, %f225;
mov.b32 %r224, %f228;
shl.b32 %r259, %r4, 10;
add.s32 %r260, %r232, %r259;
mad.lo.s32 %r261, %r260, %r65, %r230;
add.s32 %r262, %r261, %r11;
mul.wide.s32 %rd67, %r262, 4;
add.s64 %rd49, %rd15, %rd67;
// begin inline asm
st.global.cs.v4.s32 [%rd49], {%r217,%r218,%r219,%r220};
// end inline asm
add.s32 %r263, %r260, 16;
mad.lo.s32 %r264, %r263, %r65, %r230;
add.s32 %r265, %r264, %r11;
mul.wide.s32 %rd68, %r265, 4;
add.s64 %rd50, %rd15, %rd68;
// begin inline asm
st.global.cs.v4.s32 [%rd50], {%r221,%r222,%r223,%r224};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f90, [%rd8];
st.shared.f32 [%rd6+4608], %f90;
$L__BB0_9:
mov.u32 %r13, %ctaid.x;
add.s32 %r94, %r65, 31;
shr.s32 %r95, %r94, 31;
shr.u32 %r96, %r95, 27;
add.s32 %r97, %r94, %r96;
shr.s32 %r14, %r97, 5;
shl.b32 %r15, %r4, 5;
shr.s32 %r98, %r2, 31;
shr.u32 %r99, %r98, 29;
add.s32 %r100, %r2, %r99;
and.b32 %r101, %r100, -8;
sub.s32 %r16, %r2, %r101;
shl.b32 %r102, %r16, 2;
rem.s32 %r103, %r13, %r14;
shl.b32 %r104, %r103, 5;
add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r65;
shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r266, 0;
mov.u32 %r267, %r266;
mov.u32 %r268, %r266;
mov.u32 %r269, %r266;
@%p8 bra $L__BB0_12;
div.s32 %r109, %r13, %r14;
shl.b32 %r21, %r109, 5;
add.s32 %r110, %r19, %r21;
neg.s32 %r111, %r15;
setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
add.s32 %r116, %r15, %r18;
add.s32 %r117, %r116, %r21;
mad.lo.s32 %r118, %r117, %r65, %r20;
mul.wide.s32 %rd23, %r118, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r269,%r268,%r267,%r266}, [%rd22];
// end inline asm
$L__BB0_12:
mov.f32 %f237, 0f00000000;
mov.f32 %f238, 0f00000000;
mov.f32 %f239, 0f00000000;
mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r119, %r13, %r14;
shl.b32 %r30, %r119, 5;
add.s32 %r120, %r19, %r30;
mov.u32 %r121, -16;
sub.s32 %r122, %r121, %r15;
setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
add.s32 %r127, %r15, %r18;
add.s32 %r128, %r127, %r30;
add.s32 %r129, %r128, 16;
mad.lo.s32 %r130, %r129, %r65, %r20;
mul.wide.s32 %rd25, %r130, 4;
add.s64 %rd24, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
// end inline asm
mov.b32 %f240, %r123;
mov.b32 %f239, %r124;
mov.b32 %f238, %r125;
mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r131, %r13, %r14;
shl.b32 %r132, %r131, 5;
add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd26, %r32, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.f32 %f99, [%rd27];
st.shared.f32 [%rd6+4096], %f99;
$L__BB0_18:
mov.u32 %r270, 0;
mov.u32 %r271, %r270;
mov.u32 %r272, %r270;
mov.u32 %r273, %r270;
@%p8 bra $L__BB0_21;
div.s32 %r141, %r13, %r14;
shl.b32 %r33, %r141, 5;
add.s32 %r142, %r19, %r33;
neg.s32 %r143, %r31;
setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
add.s32 %r148, %r31, %r18;
add.s32 %r149, %r148, %r33;
mad.lo.s32 %r150, %r149, %r65, %r20;
mul.wide.s32 %rd29, %r150, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r273,%r272,%r271,%r270}, [%rd28];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r65;
mov.f32 %f241, 0f00000000;
mov.f32 %f242, 0f00000000;
mov.f32 %f243, 0f00000000;
mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r151, %r13, %r14;
shl.b32 %r42, %r151, 5;
add.s32 %r152, %r19, %r42;
mov.u32 %r153, -16;
sub.s32 %r154, %r153, %r31;
setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
add.s32 %r159, %r31, %r18;
add.s32 %r160, %r159, %r42;
add.s32 %r161, %r160, 16;
mad.lo.s32 %r162, %r161, %r65, %r20;
mul.wide.s32 %rd31, %r162, 4;
add.s64 %rd30, %rd3, %rd31;
// begin inline asm
ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
// end inline asm
mov.b32 %f244, %r155;
mov.b32 %f243, %r156;
mov.b32 %f242, %r157;
mov.b32 %f241, %r158;
$L__BB0_24:
div.s32 %r163, %r13, %r14;
shl.b32 %r43, %r163, 5;
shr.u32 %r165, %r98, 27;
add.s32 %r166, %r2, %r165;
and.b32 %r167, %r166, -32;
sub.s32 %r168, %r2, %r167;
add.s32 %r169, %r43, %r168;
setp.gt.s32 %p18, %r169, 215;
mul.lo.s32 %r170, %r169, %r62;
mul.wide.s32 %rd32, %r170, 4;
add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
ld.global.f32 %f108, [%rd9];
st.shared.f32 [%rd6], %f108;
st.shared.f32 [%rd6+512], %f108;
st.shared.f32 [%rd6+1024], %f108;
st.shared.f32 [%rd6+1536], %f108;
st.shared.f32 [%rd6+2048], %f108;
st.shared.f32 [%rd6+2560], %f108;
st.shared.f32 [%rd6+3072], %f108;
st.shared.f32 [%rd6+3584], %f108;
$L__BB0_26:
shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r45, %r44;
add.s32 %r46, %r19, %r43;
setp.ge.s32 %p19, %r46, %r45;
shr.s32 %r171, %r18, 31;
shr.u32 %r172, %r171, 25;
add.s32 %r173, %r18, %r172;
and.b32 %r174, %r173, -128;
sub.s32 %r175, %r18, %r174;
mul.wide.s32 %rd33, %r175, 4;
add.s64 %rd35, %rd19, %rd33;
add.s64 %rd10, %rd35, 4096;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
mov.b32 %f111, %r273;
sub.f32 %f20, %f111, %f245;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
mov.b32 %f113, %r269;
mul.f32 %f114, %f246, %f113;
sub.f32 %f23, %f20, %f114;
shl.b32 %r176, %r16, 7;
add.s32 %r177, %r176, %r18;
mul.wide.s32 %rd36, %r177, 4;
add.s64 %rd11, %rd19, %rd36;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f115, [%rd11];
mul.f32 %f247, %f115, %f17;
$L__BB0_32:
mul.f32 %f26, %f23, %f247;
@%p19 bra $L__BB0_34;
ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
mov.b32 %f118, %r272;
sub.f32 %f29, %f118, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
mov.b32 %f120, %r268;
mul.f32 %f121, %f249, %f120;
sub.f32 %f32, %f29, %f121;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f122, [%rd11+128];
mul.f32 %f250, %f122, %f17;
$L__BB0_38:
mul.f32 %f35, %f32, %f250;
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
mov.b32 %f125, %r271;
sub.f32 %f38, %f125, %f251;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
mov.b32 %f127, %r267;
mul.f32 %f128, %f252, %f127;
sub.f32 %f41, %f38, %f128;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f129, [%rd11+256];
mul.f32 %f253, %f129, %f17;
$L__BB0_44:
mul.f32 %f44, %f41, %f253;
@%p19 bra $L__BB0_46;
ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
mov.b32 %f132, %r270;
sub.f32 %f47, %f132, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
mov.b32 %f134, %r266;
mul.f32 %f135, %f255, %f134;
sub.f32 %f50, %f47, %f135;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f136, [%rd11+384];
mul.f32 %f256, %f136, %f17;
$L__BB0_50:
mul.f32 %f53, %f50, %f256;
mov.u32 %r178, -16;
sub.s32 %r47, %r178, %r44;
setp.ge.s32 %p31, %r46, %r47;
add.s32 %r48, %r18, 16;
shr.s32 %r179, %r48, 31;
shr.u32 %r180, %r179, 25;
add.s32 %r181, %r48, %r180;
and.b32 %r182, %r181, -128;
sub.s32 %r183, %r48, %r182;
mul.wide.s32 %rd38, %r183, 4;
add.s64 %rd40, %rd19, %rd38;
add.s64 %rd12, %rd40, 4096;
mov.f32 %f258, 0f00000000;
mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
sub.f32 %f56, %f244, %f257;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
mul.f32 %f140, %f258, %f240;
sub.f32 %f59, %f56, %f140;
mov.f32 %f260, 0f00000000;
mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f141, [%rd11+64];
mul.f32 %f259, %f141, %f17;
$L__BB0_56:
mul.f32 %f62, %f59, %f259;
@%p31 bra $L__BB0_58;
ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
sub.f32 %f65, %f243, %f260;
mov.f32 %f262, 0f00000000;
mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
mul.f32 %f145, %f261, %f239;
sub.f32 %f68, %f65, %f145;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f146, [%rd11+192];
mul.f32 %f262, %f146, %f17;
$L__BB0_62:
mul.f32 %f71, %f68, %f262;
mov.f32 %f264, 0f00000000;
mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
sub.f32 %f74, %f242, %f263;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
mul.f32 %f150, %f264, %f238;
sub.f32 %f77, %f74, %f150;
mov.f32 %f266, 0f00000000;
mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f151, [%rd11+320];
mul.f32 %f265, %f151, %f17;
$L__BB0_68:
mul.f32 %f80, %f77, %f265;
@%p31 bra $L__BB0_70;
ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
sub.f32 %f83, %f241, %f266;
mov.f32 %f268, 0f00000000;
mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
mul.f32 %f155, %f267, %f237;
sub.f32 %f86, %f83, %f155;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f156, [%rd11+448];
mul.f32 %f268, %f156, %f17;
$L__BB0_74:
mul.f32 %f89, %f86, %f268;
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r184, %r49;
setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
add.s32 %r189, %r49, %r18;
add.s32 %r190, %r189, %r43;
mad.lo.s32 %r191, %r190, %r65, %r20;
mul.wide.s32 %rd42, %r191, 4;
add.s64 %rd41, %rd15, %rd42;
mov.b32 %r188, %f53;
mov.b32 %r186, %f35;
mov.b32 %r185, %f26;
mov.b32 %r187, %f44;
// begin inline asm
st.global.cs.v4.s32 [%rd41], {%r185,%r186,%r187,%r188};
// end inline asm
$L__BB0_77:
mov.u32 %r192, -16;
sub.s32 %r193, %r192, %r49;
setp.ge.s32 %p46, %r46, %r193;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r198, %r48, %r49;
add.s32 %r199, %r198, %r43;
mad.lo.s32 %r200, %r199, %r65, %r20;
mul.wide.s32 %rd44, %r200, 4;
add.s64 %rd43, %rd15, %rd44;
mov.b32 %r197, %f89;
mov.b32 %r195, %f71;
mov.b32 %r194, %f62;
mov.b32 %r196, %f80;
// begin inline asm
st.global.cs.v4.s32 [%rd43], {%r194,%r195,%r196,%r197};
// end inline asm
$L__BB0_80:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -24,44 +24,44 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4[16],
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5[24]
)
{
.reg .pred %p<48>;
- .reg .f32 %f<257>;
- .reg .b32 %r<251>;
+ .reg .f32 %f<269>;
+ .reg .b32 %r<274>;
.reg .f64 %fd<3>;
- .reg .b64 %rd<61>;
+ .reg .b64 %rd<69>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0+16];
ld.param.v2.u32 {%r64, %r65}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3+8];
- ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5];
- ld.param.u64 %rd12, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4];
- ld.param.u64 %rd11, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0];
+ ld.param.u64 %rd15, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_5];
+ ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_4];
+ ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_0];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_2];
- ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1];
- cvta.to.global.u64 %rd1, %rd11;
- cvta.to.global.u64 %rd2, %rd14;
+ ld.param.u64 %rd16, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_1];
+ cvta.to.global.u64 %rd1, %rd13;
+ cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1__param_3];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd15, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
- atom.shared.min.s32 %r73, [%rd15], %r2;
+ mov.u64 %rd17, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s;
+ atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES1_S1_S2_S1_E14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r65;
rcp.rn.f64 %fd1, %fd2;
- mul.wide.s32 %rd16, %r2, 4;
- mov.u64 %rd17, _ZN11kernelscope6kernelE;
- add.s64 %rd6, %rd17, %rd16;
+ mul.wide.s32 %rd18, %r2, 4;
+ mov.u64 %rd19, _ZN11kernelscope6kernelE;
+ add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
@@ -74,590 +74,646 @@
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
- cvta.to.global.u64 %rd18, %rd12;
- mul.wide.s32 %rd19, %r8, 4;
- add.s64 %rd8, %rd18, %rd19;
+ cvta.to.global.u64 %rd20, %rd14;
+ mul.wide.s32 %rd21, %r8, 4;
+ add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
- add.s32 %r82, %r9, %r7;
- add.s32 %r83, %r82, 16;
- setp.gt.s32 %p4, %r83, 215;
+ add.s32 %r10, %r9, 16;
+ add.s32 %r82, %r10, %r7;
+ setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
- and.b32 %r87, %r81, 1073741816;
- sub.s32 %r88, %r2, %r87;
- shl.b32 %r10, %r88, 2;
- rem.s32 %r89, %r5, %r6;
- shl.b32 %r11, %r89, 5;
- add.s32 %r90, %r10, %r11;
- or.b32 %r91, %r90, 3;
- setp.ge.s32 %p5, %r91, %r65;
+ rem.s32 %r83, %r5, %r6;
+ shl.b32 %r11, %r83, 5;
+ or.b32 %r84, %r11, 31;
+ setp.ge.s32 %p5, %r84, %r65;
@%p5 bra $L__BB0_7;
- shr.u32 %r93, %r79, 27;
- add.s32 %r94, %r2, %r93;
- and.b32 %r95, %r94, -32;
- sub.s32 %r12, %r2, %r95;
- add.s32 %r13, %r7, %r12;
- setp.lt.s32 %p6, %r13, 216;
+ shr.u32 %r86, %r79, 27;
+ add.s32 %r87, %r2, %r86;
+ and.b32 %r88, %r87, -32;
+ sub.s32 %r89, %r2, %r88;
+ add.s32 %r12, %r7, %r89;
+ setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f157, [%rd8];
st.shared.f32 [%rd6+4608], %f157;
- shl.b32 %r220, %r4, 5;
- add.s32 %r221, %r7, %r9;
- add.s32 %r222, %r221, %r220;
- mad.lo.s32 %r223, %r222, %r65, %r10;
- add.s32 %r224, %r223, %r11;
- mul.wide.s32 %rd46, %r224, 4;
- add.s64 %rd40, %rd4, %rd46;
-
- ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd40];
-
- add.s32 %r225, %r222, 16;
- mad.lo.s32 %r226, %r225, %r65, %r10;
- add.s32 %r227, %r226, %r11;
- mul.wide.s32 %rd47, %r227, 4;
- add.s64 %rd41, %rd4, %rd47;
-
- ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd41];
-
- shl.b64 %rd48, %rd7, 2;
- add.s64 %rd49, %rd2, %rd48;
- ld.global.f32 %f158, [%rd49];
+ and.b32 %r228, %r81, -8;
+ sub.s32 %r229, %r2, %r228;
+ shl.b32 %r230, %r229, 2;
+ shl.b32 %r231, %r4, 5;
+ add.s32 %r232, %r7, %r9;
+ add.s32 %r233, %r232, %r231;
+ mad.lo.s32 %r234, %r233, %r65, %r230;
+ add.s32 %r235, %r234, %r11;
+ mul.wide.s32 %rd51, %r235, 4;
+ add.s64 %rd45, %rd4, %rd51;
+
+ ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd45];
+
+ add.s32 %r236, %r233, 16;
+ mad.lo.s32 %r237, %r236, %r65, %r230;
+ add.s32 %r238, %r237, %r11;
+ mul.wide.s32 %rd52, %r238, 4;
+ add.s64 %rd46, %rd4, %rd52;
+
+ ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd46];
+
+ shl.b64 %rd53, %rd7, 2;
+ add.s64 %rd54, %rd2, %rd53;
+ ld.global.f32 %f158, [%rd54];
st.shared.f32 [%rd6+4096], %f158;
- mul.lo.s32 %r228, %r4, 96;
- add.s32 %r229, %r222, %r228;
- mad.lo.s32 %r230, %r229, %r65, %r10;
- add.s32 %r231, %r230, %r11;
- mul.wide.s32 %rd50, %r231, 4;
- add.s64 %rd42, %rd3, %rd50;
-
- ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd42];
-
- add.s32 %r232, %r225, %r228;
- mad.lo.s32 %r233, %r232, %r65, %r10;
- add.s32 %r234, %r233, %r11;
- mul.wide.s32 %rd51, %r234, 4;
- add.s64 %rd43, %rd3, %rd51;
-
- ld.global.cs.v4.u32 {%r208,%r209,%r210,%r211}, [%rd43];
-
- mul.lo.s32 %r235, %r13, %r62;
- mul.wide.s32 %rd52, %r235, 4;
- add.s64 %rd53, %rd1, %rd52;
- mul.wide.s32 %rd54, %r12, 4;
- add.s64 %rd56, %rd17, %rd54;
- ld.global.f32 %f159, [%rd53];
- st.shared.f32 [%rd56], %f159;
+ mul.lo.s32 %r239, %r4, 96;
+ add.s32 %r240, %r233, %r239;
+ mad.lo.s32 %r241, %r240, %r65, %r230;
+ add.s32 %r242, %r241, %r11;
+ mul.wide.s32 %rd55, %r242, 4;
+ add.s64 %rd47, %rd3, %rd55;
+
+ ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd47];
+
+ add.s32 %r243, %r236, %r239;
+ mad.lo.s32 %r244, %r243, %r65, %r230;
+ add.s32 %r245, %r244, %r11;
+ mul.wide.s32 %rd56, %r245, 4;
+ add.s64 %rd48, %rd3, %rd56;
+
+ ld.global.cs.v4.u32 {%r213,%r214,%r215,%r216}, [%rd48];
+
+ mul.lo.s32 %r246, %r12, %r62;
+ mul.wide.s32 %rd57, %r246, 4;
+ add.s64 %rd58, %rd1, %rd57;
+ ld.global.f32 %f159, [%rd58];
+ st.shared.f32 [%rd6], %f159;
+ st.shared.f32 [%rd6+512], %f159;
+ st.shared.f32 [%rd6+1024], %f159;
+ st.shared.f32 [%rd6+1536], %f159;
+ st.shared.f32 [%rd6+2048], %f159;
+ st.shared.f32 [%rd6+2560], %f159;
+ st.shared.f32 [%rd6+3072], %f159;
+ st.shared.f32 [%rd6+3584], %f159;
barrier.sync 0;
- mul.wide.s32 %rd57, %r9, 4;
- add.s64 %rd58, %rd17, %rd57;
- ld.shared.f32 %f160, [%rd58];
- cvt.rn.f32.f64 %f161, %fd1;
- mul.f32 %f162, %f160, %f161;
- mov.b32 %f163, %r204;
- ld.shared.f32 %f164, [%rd58+4096];
- sub.f32 %f165, %f163, %f164;
- mov.b32 %f166, %r196;
- ld.shared.f32 %f167, [%rd58+4608];
- mul.f32 %f168, %f167, %f166;
- sub.f32 %f169, %f165, %f168;
- mul.f32 %f170, %f162, %f169;
- mov.b32 %r212, %f170;
- mov.b32 %f171, %r205;
- sub.f32 %f172, %f171, %f164;
- mov.b32 %f173, %r197;
- mul.f32 %f174, %f167, %f173;
+ shl.b32 %r247, %r229, 7;
+ add.s32 %r248, %r247, %r9;
+ shr.s32 %r249, %r9, 31;
+ shr.u32 %r250, %r249, 25;
+ add.s32 %r251, %r9, %r250;
+ and.b32 %r252, %r251, -128;
+ sub.s32 %r253, %r9, %r252;
+ mul.wide.s32 %rd59, %r253, 4;
+ add.s64 %rd61, %rd19, 4096;
+ add.s64 %rd62, %rd61, %rd59;
+ mov.b32 %f160, %r209;
+ ld.shared.f32 %f161, [%rd62];
+ sub.f32 %f162, %f160, %f161;
+ mov.b32 %f163, %r201;
+ ld.shared.f32 %f164, [%rd62+512];
+ mul.f32 %f165, %f164, %f163;
+ sub.f32 %f166, %f162, %f165;
+ mul.wide.s32 %rd63, %r248, 4;
+ add.s64 %rd64, %rd19, %rd63;
+ ld.shared.f32 %f167, [%rd64];
+ cvt.rn.f32.f64 %f168, %fd1;
+ mul.f32 %f169, %f167, %f168;
+ mul.f32 %f170, %f169, %f166;
+ mov.b32 %r217, %f170;
+ mov.b32 %f171, %r210;
+ sub.f32 %f172, %f171, %f161;
+ mov.b32 %f173, %r202;
+ mul.f32 %f174, %f164, %f173;
sub.f32 %f175, %f172, %f174;
- mul.f32 %f176, %f162, %f175;
- mov.b32 %r213, %f176;
- mov.b32 %f177, %r206;
- sub.f32 %f178, %f177, %f164;
- mov.b32 %f179, %r198;
- mul.f32 %f180, %f167, %f179;
- sub.f32 %f181, %f178, %f180;
- mul.f32 %f182, %f162, %f181;
- mov.b32 %r214, %f182;
- mov.b32 %f183, %r207;
- sub.f32 %f184, %f183, %f164;
- mov.b32 %f185, %r199;
- mul.f32 %f186, %f167, %f185;
- sub.f32 %f187, %f184, %f186;
- mul.f32 %f188, %f162, %f187;
- mov.b32 %r215, %f188;
- ld.shared.f32 %f189, [%rd58+64];
- mul.f32 %f190, %f189, %f161;
- mov.b32 %f191, %r208;
- ld.shared.f32 %f192, [%rd58+4160];
- sub.f32 %f193, %f191, %f192;
- mov.b32 %f194, %r200;
- ld.shared.f32 %f195, [%rd58+4672];
- mul.f32 %f196, %f195, %f194;
- sub.f32 %f197, %f193, %f196;
- mul.f32 %f198, %f190, %f197;
- mov.b32 %r216, %f198;
- mov.b32 %f199, %r209;
- sub.f32 %f200, %f199, %f192;
- mov.b32 %f201, %r201;
- mul.f32 %f202, %f195, %f201;
- sub.f32 %f203, %f200, %f202;
- mul.f32 %f204, %f190, %f203;
- mov.b32 %r217, %f204;
- mov.b32 %f205, %r210;
- sub.f32 %f206, %f205, %f192;
- mov.b32 %f207, %r202;
- mul.f32 %f208, %f195, %f207;
+ ld.shared.f32 %f176, [%rd64+128];
+ mul.f32 %f177, %f176, %f168;
+ mul.f32 %f178, %f177, %f175;
+ mov.b32 %r218, %f178;
+ mov.b32 %f179, %r211;
+ sub.f32 %f180, %f179, %f161;
+ mov.b32 %f181, %r203;
+ mul.f32 %f182, %f164, %f181;
+ sub.f32 %f183, %f180, %f182;
+ ld.shared.f32 %f184, [%rd64+256];
+ mul.f32 %f185, %f184, %f168;
+ mul.f32 %f186, %f185, %f183;
+ mov.b32 %r219, %f186;
+ mov.b32 %f187, %r212;
+ sub.f32 %f188, %f187, %f161;
+ mov.b32 %f189, %r204;
+ mul.f32 %f190, %f164, %f189;
+ sub.f32 %f191, %f188, %f190;
+ ld.shared.f32 %f192, [%rd64+384];
+ mul.f32 %f193, %f192, %f168;
+ mul.f32 %f194, %f193, %f191;
+ mov.b32 %r220, %f194;
+ shr.s32 %r254, %r10, 31;
+ shr.u32 %r255, %r254, 25;
+ add.s32 %r256, %r10, %r255;
+ and.b32 %r257, %r256, -128;
+ sub.s32 %r258, %r10, %r257;
+ mul.wide.s32 %rd65, %r258, 4;
+ add.s64 %rd66, %rd61, %rd65;
+ mov.b32 %f195, %r213;
+ ld.shared.f32 %f196, [%rd66];
+ sub.f32 %f197, %f195, %f196;
+ mov.b32 %f198, %r205;
+ ld.shared.f32 %f199, [%rd66+512];
+ mul.f32 %f200, %f199, %f198;
+ sub.f32 %f201, %f197, %f200;
+ ld.shared.f32 %f202, [%rd64+64];
+ mul.f32 %f203, %f202, %f168;
+ mul.f32 %f204, %f203, %f201;
+ mov.b32 %r221, %f204;
+ mov.b32 %f205, %r214;
+ sub.f32 %f206, %f205, %f196;
+ mov.b32 %f207, %r206;
+ mul.f32 %f208, %f199, %f207;
sub.f32 %f209, %f206, %f208;
- mul.f32 %f210, %f190, %f209;
- mov.b32 %r218, %f210;
- mov.b32 %f211, %r211;
- sub.f32 %f212, %f211, %f192;
- mov.b32 %f213, %r203;
- mul.f32 %f214, %f195, %f213;
- sub.f32 %f215, %f212, %f214;
- mul.f32 %f216, %f190, %f215;
- mov.b32 %r219, %f216;
- mul.lo.s32 %r236, %r4, 896;
- add.s32 %r237, %r229, %r236;
- mad.lo.s32 %r238, %r237, %r65, %r10;
- add.s32 %r239, %r238, %r11;
- mul.wide.s32 %rd59, %r239, 4;
- add.s64 %rd44, %rd13, %rd59;
-
- st.global.cs.v4.s32 [%rd44], {%r212,%r213,%r214,%r215};
-
- add.s32 %r240, %r232, %r236;
- mad.lo.s32 %r241, %r240, %r65, %r10;
- add.s32 %r242, %r241, %r11;
- mul.wide.s32 %rd60, %r242, 4;
- add.s64 %rd45, %rd13, %rd60;
-
- st.global.cs.v4.s32 [%rd45], {%r216,%r217,%r218,%r219};
+ ld.shared.f32 %f210, [%rd64+192];
+ mul.f32 %f211, %f210, %f168;
+ mul.f32 %f212, %f211, %f209;
+ mov.b32 %r222, %f212;
+ mov.b32 %f213, %r215;
+ sub.f32 %f214, %f213, %f196;
+ mov.b32 %f215, %r207;
+ mul.f32 %f216, %f199, %f215;
+ sub.f32 %f217, %f214, %f216;
+ ld.shared.f32 %f218, [%rd64+320];
+ mul.f32 %f219, %f218, %f168;
+ mul.f32 %f220, %f219, %f217;
+ mov.b32 %r223, %f220;
+ mov.b32 %f221, %r216;
+ sub.f32 %f222, %f221, %f196;
+ mov.b32 %f223, %r208;
+ mul.f32 %f224, %f199, %f223;
+ sub.f32 %f225, %f222, %f224;
+ ld.shared.f32 %f226, [%rd64+448];
+ mul.f32 %f227, %f226, %f168;
+ mul.f32 %f228, %f227, %f225;
+ mov.b32 %r224, %f228;
+ shl.b32 %r259, %r4, 10;
+ add.s32 %r260, %r232, %r259;
+ mad.lo.s32 %r261, %r260, %r65, %r230;
+ add.s32 %r262, %r261, %r11;
+ mul.wide.s32 %rd67, %r262, 4;
+ add.s64 %rd49, %rd15, %rd67;
+
+ st.global.cs.v4.s32 [%rd49], {%r217,%r218,%r219,%r220};
+
+ add.s32 %r263, %r260, 16;
+ mad.lo.s32 %r264, %r263, %r65, %r230;
+ add.s32 %r265, %r264, %r11;
+ mul.wide.s32 %rd68, %r265, 4;
+ add.s64 %rd50, %rd15, %rd68;
+
+ st.global.cs.v4.s32 [%rd50], {%r221,%r222,%r223,%r224};
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f90, [%rd8];
st.shared.f32 [%rd6+4608], %f90;
$L__BB0_9:
- mov.u32 %r14, %ctaid.x;
- add.s32 %r100, %r65, 31;
- shr.s32 %r101, %r100, 31;
- shr.u32 %r102, %r101, 27;
- add.s32 %r103, %r100, %r102;
- shr.s32 %r15, %r103, 5;
- shl.b32 %r16, %r4, 5;
- shr.s32 %r104, %r2, 31;
- shr.u32 %r105, %r104, 29;
- add.s32 %r106, %r2, %r105;
- and.b32 %r107, %r106, 1073741816;
- sub.s32 %r108, %r2, %r107;
- shl.b32 %r109, %r108, 2;
- rem.s32 %r110, %r14, %r15;
- shl.b32 %r111, %r110, 5;
- add.s32 %r20, %r111, %r109;
+ mov.u32 %r13, %ctaid.x;
+ add.s32 %r94, %r65, 31;
+ shr.s32 %r95, %r94, 31;
+ shr.u32 %r96, %r95, 27;
+ add.s32 %r97, %r94, %r96;
+ shr.s32 %r14, %r97, 5;
+ shl.b32 %r15, %r4, 5;
+ shr.s32 %r98, %r2, 31;
+ shr.u32 %r99, %r98, 29;
+ add.s32 %r100, %r2, %r99;
+ and.b32 %r101, %r100, -8;
+ sub.s32 %r16, %r2, %r101;
+ shl.b32 %r102, %r16, 2;
+ rem.s32 %r103, %r13, %r14;
+ shl.b32 %r104, %r103, 5;
+ add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r65;
- shr.s32 %r18, %r106, 3;
+ shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
- mov.u32 %r243, 0;
- mov.u32 %r244, %r243;
- mov.u32 %r245, %r243;
- mov.u32 %r246, %r243;
+ mov.u32 %r266, 0;
+ mov.u32 %r267, %r266;
+ mov.u32 %r268, %r266;
+ mov.u32 %r269, %r266;
@%p8 bra $L__BB0_12;
- div.s32 %r116, %r14, %r15;
- shl.b32 %r21, %r116, 5;
- add.s32 %r117, %r19, %r21;
- neg.s32 %r118, %r16;
- setp.ge.s32 %p9, %r117, %r118;
+ div.s32 %r109, %r13, %r14;
+ shl.b32 %r21, %r109, 5;
+ add.s32 %r110, %r19, %r21;
+ neg.s32 %r111, %r15;
+ setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
- add.s32 %r123, %r16, %r18;
- add.s32 %r124, %r123, %r21;
- mad.lo.s32 %r125, %r124, %r65, %r20;
- mul.wide.s32 %rd21, %r125, 4;
- add.s64 %rd20, %rd4, %rd21;
-
- ld.global.cs.v4.u32 {%r246,%r245,%r244,%r243}, [%rd20];
+ add.s32 %r116, %r15, %r18;
+ add.s32 %r117, %r116, %r21;
+ mad.lo.s32 %r118, %r117, %r65, %r20;
+ mul.wide.s32 %rd23, %r118, 4;
+ add.s64 %rd22, %rd4, %rd23;
+
+ ld.global.cs.v4.u32 {%r269,%r268,%r267,%r266}, [%rd22];
$L__BB0_12:
- mov.f32 %f225, 0f00000000;
- mov.f32 %f226, 0f00000000;
- mov.f32 %f227, 0f00000000;
- mov.f32 %f228, 0f00000000;
+ mov.f32 %f237, 0f00000000;
+ mov.f32 %f238, 0f00000000;
+ mov.f32 %f239, 0f00000000;
+ mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
- div.s32 %r126, %r14, %r15;
- shl.b32 %r30, %r126, 5;
- add.s32 %r127, %r19, %r30;
- mov.u32 %r128, -16;
- sub.s32 %r129, %r128, %r16;
- setp.ge.s32 %p11, %r127, %r129;
+ div.s32 %r119, %r13, %r14;
+ shl.b32 %r30, %r119, 5;
+ add.s32 %r120, %r19, %r30;
+ mov.u32 %r121, -16;
+ sub.s32 %r122, %r121, %r15;
+ setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
- add.s32 %r134, %r16, %r18;
- add.s32 %r135, %r134, %r30;
- add.s32 %r136, %r135, 16;
- mad.lo.s32 %r137, %r136, %r65, %r20;
- mul.wide.s32 %rd23, %r137, 4;
- add.s64 %rd22, %rd4, %rd23;
-
- ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
-
- mov.b32 %f228, %r130;
- mov.b32 %f227, %r131;
- mov.b32 %f226, %r132;
- mov.b32 %f225, %r133;
+ add.s32 %r127, %r15, %r18;
+ add.s32 %r128, %r127, %r30;
+ add.s32 %r129, %r128, 16;
+ mad.lo.s32 %r130, %r129, %r65, %r20;
+ mul.wide.s32 %rd25, %r130, 4;
+ add.s64 %rd24, %rd4, %rd25;
+
+ ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
+
+ mov.b32 %f240, %r123;
+ mov.b32 %f239, %r124;
+ mov.b32 %f238, %r125;
+ mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
- div.s32 %r138, %r14, %r15;
- shl.b32 %r139, %r138, 5;
- add.s32 %r32, %r139, %r2;
+ div.s32 %r131, %r13, %r14;
+ shl.b32 %r132, %r131, 5;
+ add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
- mul.wide.s32 %rd24, %r32, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f99, [%rd25];
+ mul.wide.s32 %rd26, %r32, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f99, [%rd27];
st.shared.f32 [%rd6+4096], %f99;
$L__BB0_18:
- mov.u32 %r247, 0;
- mov.u32 %r248, %r247;
- mov.u32 %r249, %r247;
- mov.u32 %r250, %r247;
+ mov.u32 %r270, 0;
+ mov.u32 %r271, %r270;
+ mov.u32 %r272, %r270;
+ mov.u32 %r273, %r270;
@%p8 bra $L__BB0_21;
- div.s32 %r148, %r14, %r15;
- shl.b32 %r33, %r148, 5;
- add.s32 %r149, %r19, %r33;
- neg.s32 %r150, %r31;
- setp.ge.s32 %p15, %r149, %r150;
+ div.s32 %r141, %r13, %r14;
+ shl.b32 %r33, %r141, 5;
+ add.s32 %r142, %r19, %r33;
+ neg.s32 %r143, %r31;
+ setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
- add.s32 %r155, %r31, %r18;
- add.s32 %r156, %r155, %r33;
- mad.lo.s32 %r157, %r156, %r65, %r20;
- mul.wide.s32 %rd27, %r157, 4;
- add.s64 %rd26, %rd3, %rd27;
-
- ld.global.cs.v4.u32 {%r250,%r249,%r248,%r247}, [%rd26];
+ add.s32 %r148, %r31, %r18;
+ add.s32 %r149, %r148, %r33;
+ mad.lo.s32 %r150, %r149, %r65, %r20;
+ mul.wide.s32 %rd29, %r150, 4;
+ add.s64 %rd28, %rd3, %rd29;
+
+ ld.global.cs.v4.u32 {%r273,%r272,%r271,%r270}, [%rd28];
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r65;
- mov.f32 %f229, 0f00000000;
- mov.f32 %f230, 0f00000000;
- mov.f32 %f231, 0f00000000;
- mov.f32 %f232, 0f00000000;
+ mov.f32 %f241, 0f00000000;
+ mov.f32 %f242, 0f00000000;
+ mov.f32 %f243, 0f00000000;
+ mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
- div.s32 %r158, %r14, %r15;
- shl.b32 %r42, %r158, 5;
- add.s32 %r159, %r19, %r42;
- mov.u32 %r160, -16;
- sub.s32 %r161, %r160, %r31;
- setp.ge.s32 %p17, %r159, %r161;
+ div.s32 %r151, %r13, %r14;
+ shl.b32 %r42, %r151, 5;
+ add.s32 %r152, %r19, %r42;
+ mov.u32 %r153, -16;
+ sub.s32 %r154, %r153, %r31;
+ setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
- add.s32 %r166, %r31, %r18;
- add.s32 %r167, %r166, %r42;
- add.s32 %r168, %r167, 16;
- mad.lo.s32 %r169, %r168, %r65, %r20;
- mul.wide.s32 %rd29, %r169, 4;
- add.s64 %rd28, %rd3, %rd29;
-
- ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
-
- mov.b32 %f232, %r162;
- mov.b32 %f231, %r163;
- mov.b32 %f230, %r164;
- mov.b32 %f229, %r165;
+ add.s32 %r159, %r31, %r18;
+ add.s32 %r160, %r159, %r42;
+ add.s32 %r161, %r160, 16;
+ mad.lo.s32 %r162, %r161, %r65, %r20;
+ mul.wide.s32 %rd31, %r162, 4;
+ add.s64 %rd30, %rd3, %rd31;
+
+ ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
+
+ mov.b32 %f244, %r155;
+ mov.b32 %f243, %r156;
+ mov.b32 %f242, %r157;
+ mov.b32 %f241, %r158;
$L__BB0_24:
- div.s32 %r170, %r14, %r15;
- shl.b32 %r43, %r170, 5;
- shr.u32 %r172, %r104, 27;
- add.s32 %r173, %r2, %r172;
- and.b32 %r174, %r173, -32;
- sub.s32 %r44, %r2, %r174;
- add.s32 %r175, %r43, %r44;
- setp.gt.s32 %p18, %r175, 215;
- mul.lo.s32 %r176, %r175, %r62;
- mul.wide.s32 %rd30, %r176, 4;
- add.s64 %rd9, %rd1, %rd30;
+ div.s32 %r163, %r13, %r14;
+ shl.b32 %r43, %r163, 5;
+ shr.u32 %r165, %r98, 27;
+ add.s32 %r166, %r2, %r165;
+ and.b32 %r167, %r166, -32;
+ sub.s32 %r168, %r2, %r167;
+ add.s32 %r169, %r43, %r168;
+ setp.gt.s32 %p18, %r169, 215;
+ mul.lo.s32 %r170, %r169, %r62;
+ mul.wide.s32 %rd32, %r170, 4;
+ add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
- mul.wide.s32 %rd31, %r44, 4;
- add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f108, [%rd9];
- st.shared.f32 [%rd33], %f108;
+ st.shared.f32 [%rd6], %f108;
+ st.shared.f32 [%rd6+512], %f108;
+ st.shared.f32 [%rd6+1024], %f108;
+ st.shared.f32 [%rd6+1536], %f108;
+ st.shared.f32 [%rd6+2048], %f108;
+ st.shared.f32 [%rd6+2560], %f108;
+ st.shared.f32 [%rd6+3072], %f108;
+ st.shared.f32 [%rd6+3584], %f108;
$L__BB0_26:
- shl.b32 %r45, %r4, 9;
+ shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
- neg.s32 %r46, %r45;
- add.s32 %r47, %r19, %r43;
- setp.ge.s32 %p19, %r47, %r46;
- mul.wide.s32 %rd34, %r18, 4;
- add.s64 %rd10, %rd17, %rd34;
- mov.f32 %f234, 0f00000000;
- mov.f32 %f233, %f234;
+ neg.s32 %r45, %r44;
+ add.s32 %r46, %r19, %r43;
+ setp.ge.s32 %p19, %r46, %r45;
+ shr.s32 %r171, %r18, 31;
+ shr.u32 %r172, %r171, 25;
+ add.s32 %r173, %r18, %r172;
+ and.b32 %r174, %r173, -128;
+ sub.s32 %r175, %r18, %r174;
+ mul.wide.s32 %rd33, %r175, 4;
+ add.s64 %rd35, %rd19, %rd33;
+ add.s64 %rd10, %rd35, 4096;
+ mov.f32 %f246, 0f00000000;
+ mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
- ld.shared.f32 %f233, [%rd10+4096];
+ ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
- mov.b32 %f111, %r250;
- sub.f32 %f20, %f111, %f233;
+ mov.b32 %f111, %r273;
+ sub.f32 %f20, %f111, %f245;
@%p19 bra $L__BB0_30;
- ld.shared.f32 %f234, [%rd10+4608];
+ ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
- mov.b32 %f113, %r246;
- mul.f32 %f114, %f234, %f113;
+ mov.b32 %f113, %r269;
+ mul.f32 %f114, %f246, %f113;
sub.f32 %f23, %f20, %f114;
- mov.f32 %f236, 0f00000000;
- mov.f32 %f235, %f236;
+ shl.b32 %r176, %r16, 7;
+ add.s32 %r177, %r176, %r18;
+ mul.wide.s32 %rd36, %r177, 4;
+ add.s64 %rd11, %rd19, %rd36;
+ mov.f32 %f248, 0f00000000;
+ mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
- ld.shared.f32 %f115, [%rd10];
- mul.f32 %f235, %f115, %f17;
+ ld.shared.f32 %f115, [%rd11];
+ mul.f32 %f247, %f115, %f17;
$L__BB0_32:
- mul.f32 %f26, %f23, %f235;
+ mul.f32 %f26, %f23, %f247;
@%p19 bra $L__BB0_34;
- ld.shared.f32 %f236, [%rd10+4096];
+ ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
- mov.b32 %f118, %r249;
- sub.f32 %f29, %f118, %f236;
- mov.f32 %f238, 0f00000000;
- mov.f32 %f237, %f238;
+ mov.b32 %f118, %r272;
+ sub.f32 %f29, %f118, %f248;
+ mov.f32 %f250, 0f00000000;
+ mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
- ld.shared.f32 %f237, [%rd10+4608];
+ ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
- mov.b32 %f120, %r245;
- mul.f32 %f121, %f237, %f120;
+ mov.b32 %f120, %r268;
+ mul.f32 %f121, %f249, %f120;
sub.f32 %f32, %f29, %f121;
@%p19 bra $L__BB0_38;
- ld.shared.f32 %f122, [%rd10];
- mul.f32 %f238, %f122, %f17;
+ ld.shared.f32 %f122, [%rd11+128];
+ mul.f32 %f250, %f122, %f17;
$L__BB0_38:
- mul.f32 %f35, %f32, %f238;
- mov.f32 %f240, 0f00000000;
- mov.f32 %f239, %f240;
+ mul.f32 %f35, %f32, %f250;
+ mov.f32 %f252, 0f00000000;
+ mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
- ld.shared.f32 %f239, [%rd10+4096];
+ ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
- mov.b32 %f125, %r248;
- sub.f32 %f38, %f125, %f239;
+ mov.b32 %f125, %r271;
+ sub.f32 %f38, %f125, %f251;
@%p19 bra $L__BB0_42;
- ld.shared.f32 %f240, [%rd10+4608];
+ ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
- mov.b32 %f127, %r244;
- mul.f32 %f128, %f240, %f127;
+ mov.b32 %f127, %r267;
+ mul.f32 %f128, %f252, %f127;
sub.f32 %f41, %f38, %f128;
- mov.f32 %f242, 0f00000000;
- mov.f32 %f241, %f242;
+ mov.f32 %f254, 0f00000000;
+ mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
- ld.shared.f32 %f129, [%rd10];
- mul.f32 %f241, %f129, %f17;
+ ld.shared.f32 %f129, [%rd11+256];
+ mul.f32 %f253, %f129, %f17;
$L__BB0_44:
- mul.f32 %f44, %f41, %f241;
+ mul.f32 %f44, %f41, %f253;
@%p19 bra $L__BB0_46;
- ld.shared.f32 %f242, [%rd10+4096];
+ ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
- mov.b32 %f132, %r247;
- sub.f32 %f47, %f132, %f242;
- mov.f32 %f244, 0f00000000;
- mov.f32 %f243, %f244;
+ mov.b32 %f132, %r270;
+ sub.f32 %f47, %f132, %f254;
+ mov.f32 %f256, 0f00000000;
+ mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
- ld.shared.f32 %f243, [%rd10+4608];
+ ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
- mov.b32 %f134, %r243;
- mul.f32 %f135, %f243, %f134;
+ mov.b32 %f134, %r266;
+ mul.f32 %f135, %f255, %f134;
sub.f32 %f50, %f47, %f135;
@%p19 bra $L__BB0_50;
- ld.shared.f32 %f136, [%rd10];
- mul.f32 %f244, %f136, %f17;
+ ld.shared.f32 %f136, [%rd11+384];
+ mul.f32 %f256, %f136, %f17;
$L__BB0_50:
- mul.f32 %f53, %f50, %f244;
- mov.u32 %r177, -16;
- sub.s32 %r48, %r177, %r45;
- setp.ge.s32 %p31, %r47, %r48;
- mov.f32 %f246, 0f00000000;
- mov.f32 %f245, %f246;
+ mul.f32 %f53, %f50, %f256;
+ mov.u32 %r178, -16;
+ sub.s32 %r47, %r178, %r44;
+ setp.ge.s32 %p31, %r46, %r47;
+ add.s32 %r48, %r18, 16;
+ shr.s32 %r179, %r48, 31;
+ shr.u32 %r180, %r179, 25;
+ add.s32 %r181, %r48, %r180;
+ and.b32 %r182, %r181, -128;
+ sub.s32 %r183, %r48, %r182;
+ mul.wide.s32 %rd38, %r183, 4;
+ add.s64 %rd40, %rd19, %rd38;
+ add.s64 %rd12, %rd40, 4096;
+ mov.f32 %f258, 0f00000000;
+ mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
- ld.shared.f32 %f245, [%rd10+4160];
+ ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
- sub.f32 %f56, %f232, %f245;
+ sub.f32 %f56, %f244, %f257;
@%p31 bra $L__BB0_54;
- ld.shared.f32 %f246, [%rd10+4672];
+ ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
- mul.f32 %f140, %f246, %f228;
+ mul.f32 %f140, %f258, %f240;
sub.f32 %f59, %f56, %f140;
- mov.f32 %f248, 0f00000000;
- mov.f32 %f247, %f248;
+ mov.f32 %f260, 0f00000000;
+ mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
- ld.shared.f32 %f141, [%rd10+64];
- mul.f32 %f247, %f141, %f17;
+ ld.shared.f32 %f141, [%rd11+64];
+ mul.f32 %f259, %f141, %f17;
$L__BB0_56:
- mul.f32 %f62, %f59, %f247;
+ mul.f32 %f62, %f59, %f259;
@%p31 bra $L__BB0_58;
- ld.shared.f32 %f248, [%rd10+4160];
+ ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
- sub.f32 %f65, %f231, %f248;
- mov.f32 %f250, 0f00000000;
- mov.f32 %f249, %f250;
+ sub.f32 %f65, %f243, %f260;
+ mov.f32 %f262, 0f00000000;
+ mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
- ld.shared.f32 %f249, [%rd10+4672];
+ ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
- mul.f32 %f145, %f249, %f227;
+ mul.f32 %f145, %f261, %f239;
sub.f32 %f68, %f65, %f145;
@%p31 bra $L__BB0_62;
- ld.shared.f32 %f146, [%rd10+64];
- mul.f32 %f250, %f146, %f17;
+ ld.shared.f32 %f146, [%rd11+192];
+ mul.f32 %f262, %f146, %f17;
$L__BB0_62:
- mul.f32 %f71, %f68, %f250;
- mov.f32 %f252, 0f00000000;
- mov.f32 %f251, %f252;
+ mul.f32 %f71, %f68, %f262;
+ mov.f32 %f264, 0f00000000;
+ mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
- ld.shared.f32 %f251, [%rd10+4160];
+ ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
- sub.f32 %f74, %f230, %f251;
+ sub.f32 %f74, %f242, %f263;
@%p31 bra $L__BB0_66;
- ld.shared.f32 %f252, [%rd10+4672];
+ ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
- mul.f32 %f150, %f252, %f226;
+ mul.f32 %f150, %f264, %f238;
sub.f32 %f77, %f74, %f150;
- mov.f32 %f254, 0f00000000;
- mov.f32 %f253, %f254;
+ mov.f32 %f266, 0f00000000;
+ mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
- ld.shared.f32 %f151, [%rd10+64];
- mul.f32 %f253, %f151, %f17;
+ ld.shared.f32 %f151, [%rd11+320];
+ mul.f32 %f265, %f151, %f17;
$L__BB0_68:
- mul.f32 %f80, %f77, %f253;
+ mul.f32 %f80, %f77, %f265;
@%p31 bra $L__BB0_70;
- ld.shared.f32 %f254, [%rd10+4160];
+ ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
- sub.f32 %f83, %f229, %f254;
- mov.f32 %f256, 0f00000000;
- mov.f32 %f255, %f256;
+ sub.f32 %f83, %f241, %f266;
+ mov.f32 %f268, 0f00000000;
+ mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
- ld.shared.f32 %f255, [%rd10+4672];
+ ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
- mul.f32 %f155, %f255, %f225;
+ mul.f32 %f155, %f267, %f237;
sub.f32 %f86, %f83, %f155;
@%p31 bra $L__BB0_74;
- ld.shared.f32 %f156, [%rd10+64];
- mul.f32 %f256, %f156, %f17;
+ ld.shared.f32 %f156, [%rd11+448];
+ mul.f32 %f268, %f156, %f17;
$L__BB0_74:
- mul.f32 %f89, %f86, %f256;
+ mul.f32 %f89, %f86, %f268;
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
- neg.s32 %r178, %r49;
- setp.ge.s32 %p44, %r47, %r178;
+ neg.s32 %r184, %r49;
+ setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
- add.s32 %r183, %r49, %r18;
- add.s32 %r184, %r183, %r43;
- mad.lo.s32 %r185, %r184, %r65, %r20;
- mul.wide.s32 %rd37, %r185, 4;
- add.s64 %rd36, %rd13, %rd37;
- mov.b32 %r182, %f53;
- mov.b32 %r180, %f35;
- mov.b32 %r179, %f26;
- mov.b32 %r181, %f44;
-
- st.global.cs.v4.s32 [%rd36], {%r179,%r180,%r181,%r182};
+ add.s32 %r189, %r49, %r18;
+ add.s32 %r190, %r189, %r43;
+ mad.lo.s32 %r191, %r190, %r65, %r20;
+ mul.wide.s32 %rd42, %r191, 4;
+ add.s64 %rd41, %rd15, %rd42;
+ mov.b32 %r188, %f53;
+ mov.b32 %r186, %f35;
+ mov.b32 %r185, %f26;
+ mov.b32 %r187, %f44;
+
+ st.global.cs.v4.s32 [%rd41], {%r185,%r186,%r187,%r188};
$L__BB0_77:
- mov.u32 %r186, -16;
- sub.s32 %r187, %r186, %r49;
- setp.ge.s32 %p46, %r47, %r187;
+ mov.u32 %r192, -16;
+ sub.s32 %r193, %r192, %r49;
+ setp.ge.s32 %p46, %r46, %r193;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
- add.s32 %r192, %r49, %r18;
- add.s32 %r193, %r192, %r43;
- add.s32 %r194, %r193, 16;
- mad.lo.s32 %r195, %r194, %r65, %r20;
- mul.wide.s32 %rd39, %r195, 4;
- add.s64 %rd38, %rd13, %rd39;
- mov.b32 %r191, %f89;
- mov.b32 %r189, %f71;
- mov.b32 %r188, %f62;
- mov.b32 %r190, %f80;
-
- st.global.cs.v4.s32 [%rd38], {%r188,%r189,%r190,%r191};
+ add.s32 %r198, %r48, %r49;
+ add.s32 %r199, %r198, %r43;
+ mad.lo.s32 %r200, %r199, %r65, %r20;
+ mul.wide.s32 %rd44, %r200, 4;
+ add.s64 %rd43, %rd15, %rd44;
+ mov.b32 %r197, %f89;
+ mov.b32 %r195, %f71;
+ mov.b32 %r194, %f62;
+ mov.b32 %r196, %f80;
+
+ st.global.cs.v4.s32 [%rd43], {%r194,%r195,%r196,%r197};
$L__BB0_80:
ret;
13: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_32
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<631>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
shl.b32 %r248, %r4, 2;
max.s32 %r249, %r2, %r3;
mad.lo.s32 %r250, %r248, %r249, 15;
and.b32 %r251, %r250, -16;
cvt.u64.u32 %rd2, %r251;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r252, %r8, 7;
setp.lt.s32 %p11, %r252, %r202;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
// end inline asm
shl.b32 %r256, %r5, 4;
add.s32 %r254, %r253, %r256;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r255, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r255, 0;
cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r583, %r6, 4;
add.s32 %r257, %r4, 215;
div.s32 %r258, %r257, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r259, %r11, %r258;
add.s32 %r260, %r259, -1;
div.s32 %r12, %r260, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r262, %ctaid.y;
mul.lo.s32 %r263, %r12, %r4;
mul.lo.s32 %r13, %r263, %r262;
shl.b32 %r264, %r9, 1;
mov.u32 %r265, 1;
shl.b32 %r266, %r5, 4;
mad.lo.s32 %r14, %r264, %r202, %r266;
mul.lo.s32 %r267, %r202, %r9;
cvt.s64.s32 %rd52, %r267;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r268, %r13, %r202;
cvt.s64.s32 %rd6, %r268;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r269, %tid.z;
mad.lo.s32 %r270, %r4, %r269, %r9;
mad.lo.s32 %r15, %r270, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r271, %r3;
mov.u32 %r272, 31;
sub.s32 %r273, %r272, %r271;
shl.b32 %r16, %r265, %r273;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r274, %r16, %r5;
setp.lt.u32 %p18, %r274, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r275, %r15, %r16;
mul.wide.s32 %rd55, %r275, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r276, %r16, 31;
add.s32 %r277, %r16, %r276;
shr.s32 %r17, %r277, 1;
add.s32 %r18, %r267, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r18, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r270, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r580, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r14, %r281;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r14, %r284;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r580, %r4;
add.s32 %r279, %r23, %r9;
add.s32 %r24, %r279, %r13;
setp.gt.s32 %p19, %r24, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r24, %r211;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r24, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r578, %r580, %r4;
mul.lo.s32 %r287, %r578, %r202;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r576, %r580, %r4;
add.s32 %r575, %r576, %r9;
add.s32 %r574, %r575, %r13;
setp.gt.s32 %p204, %r574, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r24, %r215;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r583, %r583, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r581, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r581;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r581, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r581, 1;
setp.gt.u32 %p27, %r581, 3;
mov.u32 %r581, %r36;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r582, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r582;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r582, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r582, 1;
setp.gt.u32 %p33, %r582, 3;
mov.u32 %r582, %r38;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
add.s32 %r351, %r13, %r577;
mad.lo.s32 %r352, %r351, %r202, %r18;
mul.wide.s32 %rd82, %r352, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r580, %r580, 1;
setp.lt.s32 %p39, %r580, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r353, %tid.z;
mad.lo.s32 %r354, %r4, %r353, %r9;
mad.lo.s32 %r50, %r354, %r3, %r5;
mul.wide.u32 %rd83, %r50, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r355, %r4;
mov.u32 %r356, 31;
sub.s32 %r51, %r356, %r355;
mov.u32 %r357, 1;
shl.b32 %r614, %r357, %r51;
setp.lt.u32 %p40, %r9, %r614;
add.s32 %r358, %r614, %r9;
setp.lt.u32 %p41, %r358, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r359, %r3, %r51;
add.s32 %r360, %r50, %r359;
mul.wide.s32 %rd85, %r360, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r614, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r584, %r614;
$L__BB0_48:
shr.u32 %r54, %r584, 1;
setp.ge.u32 %p44, %r9, %r54;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r361, %r54, %r3, %r50;
mul.wide.s32 %rd88, %r361, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r584, 7;
mov.u32 %r584, %r54;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r585, 0;
add.s32 %r363, %r50, %r3;
mul.wide.u32 %rd91, %r363, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r585, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r364, %r3, %r51;
add.s32 %r365, %r50, %r364;
mul.wide.s32 %rd93, %r365, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r586, %r614;
$L__BB0_59:
shr.u32 %r58, %r586, 1;
setp.ge.u32 %p50, %r9, %r58;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r366, %r58, %r3, %r50;
mul.wide.s32 %rd96, %r366, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r586, 7;
mov.u32 %r586, %r58;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r587, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r368, %r3, %r51;
add.s32 %r369, %r50, %r368;
mul.wide.s32 %rd99, %r369, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r588, %r614;
$L__BB0_70:
shr.u32 %r62, %r588, 1;
setp.ge.u32 %p56, %r9, %r62;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r370, %r62, %r3, %r50;
mul.wide.s32 %rd102, %r370, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r588, 7;
mov.u32 %r588, %r62;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r589, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r372, %r3, %r51;
add.s32 %r373, %r50, %r372;
mul.wide.s32 %rd105, %r373, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r590, %r614;
$L__BB0_81:
shr.u32 %r66, %r590, 1;
setp.ge.u32 %p62, %r9, %r66;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r374, %r66, %r3, %r50;
mul.wide.s32 %rd108, %r374, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r590, 7;
mov.u32 %r590, %r66;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r591, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r376, %r3, %r51;
add.s32 %r377, %r50, %r376;
mul.wide.s32 %rd111, %r377, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r592, %r614;
$L__BB0_92:
shr.u32 %r70, %r592, 1;
setp.ge.u32 %p68, %r9, %r70;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r378, %r70, %r3, %r50;
mul.wide.s32 %rd114, %r378, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r592, 7;
mov.u32 %r592, %r70;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r593, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r380, %r3, %r51;
add.s32 %r381, %r50, %r380;
mul.wide.s32 %rd117, %r381, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r594, %r614;
$L__BB0_103:
shr.u32 %r74, %r594, 1;
setp.ge.u32 %p74, %r9, %r74;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r382, %r74, %r3, %r50;
mul.wide.s32 %rd120, %r382, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r594, 7;
mov.u32 %r594, %r74;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r595, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r384, %r3, %r51;
add.s32 %r385, %r50, %r384;
mul.wide.s32 %rd123, %r385, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r596, %r614;
$L__BB0_114:
shr.u32 %r78, %r596, 1;
setp.ge.u32 %p80, %r9, %r78;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r386, %r78, %r3, %r50;
mul.wide.s32 %rd126, %r386, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r596, 7;
mov.u32 %r596, %r78;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r597, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r388, %r3, %r51;
add.s32 %r389, %r50, %r388;
mul.wide.s32 %rd129, %r389, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r598, %r614;
$L__BB0_125:
shr.u32 %r82, %r598, 1;
setp.ge.u32 %p86, %r9, %r82;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r390, %r82, %r3, %r50;
mul.wide.s32 %rd132, %r390, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r598, 7;
mov.u32 %r598, %r82;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r599, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r583, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r392, %r3, %r51;
add.s32 %r393, %r50, %r392;
mul.wide.s32 %rd135, %r393, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r600, %r614;
$L__BB0_136:
shr.u32 %r87, %r600, 1;
setp.ge.u32 %p92, %r9, %r87;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r394, %r87, %r3, %r50;
mul.wide.s32 %rd138, %r394, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r600, 7;
mov.u32 %r600, %r87;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r601, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r396, %r3, %r51;
add.s32 %r397, %r50, %r396;
mul.wide.s32 %rd141, %r397, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r602, %r614;
$L__BB0_147:
shr.u32 %r91, %r602, 1;
setp.ge.u32 %p98, %r9, %r91;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r398, %r91, %r3, %r50;
mul.wide.s32 %rd144, %r398, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r602, 7;
mov.u32 %r602, %r91;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r603, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r400, %r3, %r51;
add.s32 %r401, %r50, %r400;
mul.wide.s32 %rd147, %r401, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r604, %r614;
$L__BB0_158:
shr.u32 %r95, %r604, 1;
setp.ge.u32 %p104, %r9, %r95;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r402, %r95, %r3, %r50;
mul.wide.s32 %rd150, %r402, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r604, 7;
mov.u32 %r604, %r95;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r605, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r404, %r3, %r51;
add.s32 %r405, %r50, %r404;
mul.wide.s32 %rd153, %r405, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r606, %r614;
$L__BB0_169:
shr.u32 %r99, %r606, 1;
setp.ge.u32 %p110, %r9, %r99;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r406, %r99, %r3, %r50;
mul.wide.s32 %rd156, %r406, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r606, 7;
mov.u32 %r606, %r99;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r607, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r408, %r3, %r51;
add.s32 %r409, %r50, %r408;
mul.wide.s32 %rd159, %r409, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r608, %r614;
$L__BB0_180:
shr.u32 %r103, %r608, 1;
setp.ge.u32 %p116, %r9, %r103;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r410, %r103, %r3, %r50;
mul.wide.s32 %rd162, %r410, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r608, 7;
mov.u32 %r608, %r103;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r609, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r412, %r3, %r51;
add.s32 %r413, %r50, %r412;
mul.wide.s32 %rd165, %r413, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r610, %r614;
$L__BB0_191:
shr.u32 %r107, %r610, 1;
setp.ge.u32 %p122, %r9, %r107;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r414, %r107, %r3, %r50;
mul.wide.s32 %rd168, %r414, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r610, 7;
mov.u32 %r610, %r107;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r611, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r611, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r416, %r3, %r51;
add.s32 %r417, %r50, %r416;
mul.wide.s32 %rd171, %r417, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r612, %r614;
$L__BB0_202:
shr.u32 %r111, %r612, 1;
setp.ge.u32 %p128, %r9, %r111;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r418, %r111, %r3, %r50;
mul.wide.s32 %rd174, %r418, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r612, 7;
mov.u32 %r612, %r111;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r613, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r613, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r420, %r3, %r51;
add.s32 %r421, %r50, %r420;
mul.wide.s32 %rd177, %r421, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r614, 1;
setp.ge.u32 %p134, %r9, %r115;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r422, %r115, %r3, %r50;
mul.wide.s32 %rd180, %r422, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r614, 7;
mov.u32 %r614, %r115;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r615, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r615, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
shl.b32 %r573, %r5, 3;
mov.u32 %r448, %ctaid.y;
mad.lo.s32 %r449, %r202, %r448, %r573;
add.s32 %r450, %r449, %r85;
mul.wide.s32 %rd189, %r450, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
// end inline asm
add.s32 %r451, %r450, 4;
mul.wide.s32 %rd190, %r451, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r424, %r570, 3;
sub.s32 %r118, %r424, %r202;
mov.u32 %r425, %ctaid.y;
mad.lo.s32 %r119, %r202, %r425, %r570;
neg.s32 %r426, %r85;
setp.ge.s32 %p141, %r118, %r426;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r431, %r119, %r85;
mul.wide.s32 %rd184, %r431, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
// end inline asm
$L__BB0_222:
mov.u32 %r432, -4;
sub.s32 %r433, %r432, %r85;
setp.ge.s32 %p143, %r118, %r433;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r438, %r119, %r85;
add.s32 %r439, %r438, 4;
mul.wide.s32 %rd186, %r439, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
// end inline asm
$L__BB0_226:
shl.b32 %r120, %r583, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
shl.b32 %r572, %r5, 3;
mov.u32 %r476, %ctaid.y;
mad.lo.s32 %r477, %r202, %r476, %r572;
add.s32 %r478, %r477, %r120;
mul.wide.s32 %rd197, %r478, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
// end inline asm
add.s32 %r479, %r478, 4;
mul.wide.s32 %rd198, %r479, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r452, %r571, 3;
sub.s32 %r121, %r452, %r202;
mov.u32 %r453, %ctaid.y;
mad.lo.s32 %r122, %r202, %r453, %r571;
neg.s32 %r454, %r120;
setp.ge.s32 %p150, %r121, %r454;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r459, %r122, %r120;
mul.wide.s32 %rd192, %r459, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
// end inline asm
$L__BB0_229:
mov.u32 %r460, -4;
sub.s32 %r461, %r460, %r120;
setp.ge.s32 %p152, %r121, %r461;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r466, %r122, %r120;
add.s32 %r467, %r466, 4;
mul.wide.s32 %rd194, %r467, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
// end inline asm
$L__BB0_233:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r480, %r5, %r9;
or.b32 %r482, %r480, %r353;
setp.ne.s32 %p156, %r482, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r483, %ctaid.x;
mov.u32 %r484, %ctaid.z;
mov.u32 %r485, %nctaid.x;
mad.lo.s32 %r486, %r484, %r485, %r483;
mul.wide.s32 %rd200, %r486, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r487, %r11, -1;
setp.eq.s32 %p157, %r123, %r487;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r616, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r616;
// end inline asm
setp.lt.u32 %p159, %r616, 256;
selp.u32 %r490, 1, 0, %p159;
shl.b32 %r616, %r616, %r490;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r491, %r11, %r3;
add.s32 %r492, %r491, -1;
div.s32 %r126, %r492, %r3;
setp.lt.s32 %p161, %r126, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r494, %r202, 1;
shr.u32 %r495, %r494, 31;
add.s32 %r496, %r494, %r495;
shr.s32 %r497, %r496, 1;
add.s32 %r498, %r4, %r497;
add.s32 %r499, %r498, -1;
shl.b32 %r500, %r9, 1;
shl.b32 %r501, %r4, 1;
mad.lo.s32 %r502, %r501, %r123, %r500;
or.b32 %r503, %r502, 1;
setp.ge.s32 %p162, %r503, %r202;
div.s32 %r504, %r499, %r4;
setp.ge.s32 %p163, %r123, %r504;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r505, %r4, %r123;
shl.b32 %r506, %r505, 1;
mad.lo.s32 %r507, %r202, %r5, %r506;
add.s32 %r618, %r507, %r500;
mul.lo.s32 %r128, %r202, %r3;
mov.u32 %r493, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r617, %r5;
mov.u32 %r619, %r493;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r617, %r11;
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r618, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r621;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r620;
add.f32 %f678, %f678, %f559;
add.s32 %r618, %r618, %r128;
add.s32 %r617, %r617, %r3;
add.s32 %r619, %r619, 1;
setp.lt.s32 %p165, %r619, %r126;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
mov.u32 %r517, 1;
shl.b32 %r139, %r517, %r516;
setp.lt.u32 %p166, %r5, %r139;
add.s32 %r518, %r139, %r5;
setp.lt.u32 %p167, %r518, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r519, %r50, %r139;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r520, %r139, 31;
add.s32 %r521, %r139, %r520;
shr.s32 %r630, %r521, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r139, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r622, %r630;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r622;
@%p170 bra $L__BB0_249;
add.s32 %r522, %r622, %r50;
mul.wide.s32 %rd213, %r522, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r142, %r622, 1;
setp.gt.u32 %p171, %r622, 3;
mov.u32 %r622, %r142;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r523, %r50, 1;
mul.wide.u32 %rd216, %r523, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r623, %r630;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r623;
@%p176 bra $L__BB0_259;
add.s32 %r524, %r623, %r50;
mul.wide.s32 %rd218, %r524, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r144, %r623, 1;
setp.gt.u32 %p177, %r623, 3;
mov.u32 %r623, %r144;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r525, %r202, 1;
shr.u32 %r526, %r525, 31;
add.s32 %r527, %r525, %r526;
shr.s32 %r528, %r527, 1;
add.s32 %r529, %r4, %r528;
add.s32 %r530, %r529, -1;
div.s32 %r531, %r530, %r4;
setp.ge.s32 %p181, %r123, %r531;
@%p181 bra $L__BB0_267;
shl.b32 %r145, %r9, 1;
mul.lo.s32 %r532, %r4, %r123;
shl.b32 %r146, %r532, 1;
add.s32 %r533, %r145, %r146;
or.b32 %r534, %r533, 1;
setp.ge.s32 %p182, %r534, %r202;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r535, %r146, %r145;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r537, %r202, 1;
shr.u32 %r538, %r537, 31;
add.s32 %r539, %r537, %r538;
shr.s32 %r540, %r539, 1;
add.s32 %r541, %r4, %r540;
add.s32 %r542, %r541, -1;
shl.b32 %r543, %r9, 1;
shl.b32 %r544, %r4, 1;
mad.lo.s32 %r545, %r544, %r123, %r543;
or.b32 %r546, %r545, 1;
setp.ge.s32 %p184, %r546, %r202;
div.s32 %r547, %r542, %r4;
setp.ge.s32 %p185, %r123, %r547;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r548, %r4, %r123;
shl.b32 %r549, %r548, 1;
mad.lo.s32 %r550, %r202, %r5, %r549;
add.s32 %r625, %r550, %r543;
mul.lo.s32 %r148, %r202, %r3;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r624, %r5;
mov.u32 %r626, %r536;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r624, %r11;
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r625, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r628;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r627;
add.f32 %f684, %f684, %f585;
add.s32 %r625, %r625, %r148;
add.s32 %r624, %r624, %r3;
add.s32 %r626, %r626, 1;
setp.lt.s32 %p187, %r626, %r126;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r629, %r630;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r629;
@%p190 bra $L__BB0_279;
add.s32 %r557, %r629, %r50;
mul.wide.s32 %rd226, %r557, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r160, %r629, 1;
setp.gt.u32 %p191, %r629, 3;
mov.u32 %r629, %r160;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r630;
@%p196 bra $L__BB0_288;
add.s32 %r558, %r630, %r50;
mul.wide.s32 %rd229, %r558, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r162, %r630, 1;
setp.gt.u32 %p197, %r630, 3;
mov.u32 %r630, %r162;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r559, %r202, 1;
shr.u32 %r560, %r559, 31;
add.s32 %r561, %r559, %r560;
shr.s32 %r562, %r561, 1;
add.s32 %r563, %r4, %r562;
add.s32 %r564, %r563, -1;
div.s32 %r565, %r564, %r4;
setp.ge.s32 %p201, %r123, %r565;
@%p201 bra $L__BB0_296;
shl.b32 %r163, %r9, 1;
mul.lo.s32 %r566, %r4, %r123;
shl.b32 %r164, %r566, 1;
add.s32 %r567, %r163, %r164;
or.b32 %r568, %r567, 1;
setp.ge.s32 %p202, %r568, %r202;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_6130d4d1_1033910nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r569, %r164, %r163;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r569, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r200, %r201}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r210, %r211}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r214, %r215}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r236, %r201, 7;
shr.s32 %r237, %r236, 31;
shr.u32 %r238, %r237, 29;
add.s32 %r239, %r236, %r238;
shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r240, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r242, %r4, %r2;
shl.b32 %r243, %r242, 4;
or.b32 %r244, %r243, 15;
and.b32 %r7, %r244, -16;
add.s32 %r245, %r244, %r7;
and.b32 %r246, %r245, -16;
cvt.s64.s32 %rd1, %r246;
shl.b32 %r247, %r4, 2;
max.s32 %r248, %r2, %r3;
mad.lo.s32 %r249, %r247, %r248, 15;
and.b32 %r250, %r249, -16;
cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r251, %r8, 7;
setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
// end inline asm
shl.b32 %r255, %r5, 4;
add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r254, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r254, 0;
cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r577, %r6, 4;
add.s32 %r256, %r4, 215;
div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r258, %r11, %r257;
add.s32 %r259, %r258, -1;
div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r261, %ctaid.y;
mul.lo.s32 %r262, %r12, %r4;
mul.lo.s32 %r13, %r262, %r261;
mad.lo.s32 %r263, %r2, %r9, %r5;
shl.b32 %r14, %r263, 4;
mul.lo.s32 %r264, %r201, %r9;
cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r265, %r13, %r201;
cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r266, %tid.z;
mad.lo.s32 %r267, %r4, %r266, %r9;
mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r268, %r3;
mov.u32 %r269, 31;
sub.s32 %r270, %r269, %r268;
mov.u32 %r271, 1;
shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r272, %r16, %r5;
setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r273, %r15, %r16;
mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r274, %r16, 31;
add.s32 %r275, %r16, %r274;
shr.s32 %r17, %r275, 1;
shl.b32 %r276, %r9, 3;
mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r281, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r574, %r4;
add.s32 %r279, %r22, %r9;
add.s32 %r23, %r279, %r13;
setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r572, %r574, %r4;
mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r571, %r574, %r4;
add.s32 %r570, %r571, %r9;
add.s32 %r569, %r570, %r13;
setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r575, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r575, 1;
setp.gt.u32 %p27, %r575, 3;
mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r576, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r576, 1;
setp.gt.u32 %p33, %r576, 3;
mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
mad.lo.s32 %r351, %r23, %r201, %r8;
mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r574, %r574, 1;
setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r352, %tid.z;
mad.lo.s32 %r353, %r4, %r352, %r9;
mad.lo.s32 %r49, %r353, %r3, %r5;
mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r354, %r4;
mov.u32 %r355, 31;
sub.s32 %r50, %r355, %r354;
mov.u32 %r356, 1;
shl.b32 %r608, %r356, %r50;
setp.lt.u32 %p40, %r9, %r608;
add.s32 %r357, %r608, %r9;
setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r358, %r3, %r50;
add.s32 %r359, %r49, %r358;
mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r578, %r608;
$L__BB0_48:
shr.u32 %r53, %r578, 1;
setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r360, %r53, %r3, %r49;
mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r578, 7;
mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r579, 0;
add.s32 %r362, %r49, %r3;
mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r363, %r3, %r50;
add.s32 %r364, %r49, %r363;
mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r580, %r608;
$L__BB0_59:
shr.u32 %r57, %r580, 1;
setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r365, %r57, %r3, %r49;
mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r580, 7;
mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r367, %r3, %r50;
add.s32 %r368, %r49, %r367;
mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r582, %r608;
$L__BB0_70:
shr.u32 %r61, %r582, 1;
setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r369, %r61, %r3, %r49;
mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r582, 7;
mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r371, %r3, %r50;
add.s32 %r372, %r49, %r371;
mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r584, %r608;
$L__BB0_81:
shr.u32 %r65, %r584, 1;
setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r373, %r65, %r3, %r49;
mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r584, 7;
mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r375, %r3, %r50;
add.s32 %r376, %r49, %r375;
mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r586, %r608;
$L__BB0_92:
shr.u32 %r69, %r586, 1;
setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r377, %r69, %r3, %r49;
mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r586, 7;
mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r379, %r3, %r50;
add.s32 %r380, %r49, %r379;
mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r588, %r608;
$L__BB0_103:
shr.u32 %r73, %r588, 1;
setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r381, %r73, %r3, %r49;
mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r588, 7;
mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r383, %r3, %r50;
add.s32 %r384, %r49, %r383;
mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r590, %r608;
$L__BB0_114:
shr.u32 %r77, %r590, 1;
setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r385, %r77, %r3, %r49;
mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r590, 7;
mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r387, %r3, %r50;
add.s32 %r388, %r49, %r387;
mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r592, %r608;
$L__BB0_125:
shr.u32 %r81, %r592, 1;
setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r389, %r81, %r3, %r49;
mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r592, 7;
mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r391, %r3, %r50;
add.s32 %r392, %r49, %r391;
mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r594, %r608;
$L__BB0_136:
shr.u32 %r86, %r594, 1;
setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r393, %r86, %r3, %r49;
mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r594, 7;
mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r395, %r3, %r50;
add.s32 %r396, %r49, %r395;
mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r596, %r608;
$L__BB0_147:
shr.u32 %r90, %r596, 1;
setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r397, %r90, %r3, %r49;
mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r596, 7;
mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r399, %r3, %r50;
add.s32 %r400, %r49, %r399;
mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r598, %r608;
$L__BB0_158:
shr.u32 %r94, %r598, 1;
setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r401, %r94, %r3, %r49;
mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r598, 7;
mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r403, %r3, %r50;
add.s32 %r404, %r49, %r403;
mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r600, %r608;
$L__BB0_169:
shr.u32 %r98, %r600, 1;
setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r405, %r98, %r3, %r49;
mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r600, 7;
mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r407, %r3, %r50;
add.s32 %r408, %r49, %r407;
mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r602, %r608;
$L__BB0_180:
shr.u32 %r102, %r602, 1;
setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r409, %r102, %r3, %r49;
mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r602, 7;
mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r411, %r3, %r50;
add.s32 %r412, %r49, %r411;
mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r604, %r608;
$L__BB0_191:
shr.u32 %r106, %r604, 1;
setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r413, %r106, %r3, %r49;
mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r604, 7;
mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r415, %r3, %r50;
add.s32 %r416, %r49, %r415;
mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r606, %r608;
$L__BB0_202:
shr.u32 %r110, %r606, 1;
setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r417, %r110, %r3, %r49;
mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r606, 7;
mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r419, %r3, %r50;
add.s32 %r420, %r49, %r419;
mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r608, 1;
setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r421, %r114, %r3, %r49;
mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r608, 7;
mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
mov.u32 %r447, %ctaid.y;
mad.lo.s32 %r448, %r201, %r447, %r8;
add.s32 %r449, %r448, %r84;
mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
// end inline asm
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r423, %r8, 3;
sub.s32 %r117, %r423, %r201;
mov.u32 %r424, %ctaid.y;
mad.lo.s32 %r118, %r201, %r424, %r8;
neg.s32 %r425, %r84;
setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r430, %r118, %r84;
mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_222:
mov.u32 %r431, -4;
sub.s32 %r432, %r431, %r84;
setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r437, %r118, %r84;
add.s32 %r438, %r437, 4;
mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
// end inline asm
$L__BB0_226:
shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
mov.u32 %r475, %ctaid.y;
mad.lo.s32 %r476, %r201, %r475, %r8;
add.s32 %r477, %r476, %r119;
mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
// end inline asm
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r451, %r8, 3;
sub.s32 %r120, %r451, %r201;
mov.u32 %r452, %ctaid.y;
mad.lo.s32 %r121, %r201, %r452, %r8;
neg.s32 %r453, %r119;
setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r458, %r121, %r119;
mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
// end inline asm
$L__BB0_229:
mov.u32 %r459, -4;
sub.s32 %r460, %r459, %r119;
setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r465, %r121, %r119;
add.s32 %r466, %r465, 4;
mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
// end inline asm
$L__BB0_233:
mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r479, %r5, %r9;
or.b32 %r481, %r479, %r352;
setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r482, %ctaid.x;
mov.u32 %r483, %ctaid.z;
mov.u32 %r484, %nctaid.x;
mad.lo.s32 %r485, %r483, %r484, %r482;
mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r486, %r11, -1;
setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r610, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r610;
// end inline asm
setp.lt.u32 %p159, %r610, 256;
selp.u32 %r489, 1, 0, %p159;
shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r490, %r11, %r3;
add.s32 %r491, %r490, -1;
div.s32 %r125, %r491, %r3;
setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r493, %r201, 1;
shr.u32 %r494, %r493, 31;
add.s32 %r495, %r493, %r494;
shr.s32 %r496, %r495, 1;
add.s32 %r497, %r4, %r496;
add.s32 %r498, %r497, -1;
shl.b32 %r499, %r9, 1;
shl.b32 %r500, %r4, 1;
mad.lo.s32 %r501, %r500, %r122, %r499;
or.b32 %r502, %r501, 1;
setp.ge.s32 %p162, %r502, %r201;
div.s32 %r503, %r498, %r4;
setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r504, %r4, %r122;
shl.b32 %r505, %r504, 1;
mad.lo.s32 %r506, %r201, %r5, %r505;
add.s32 %r612, %r506, %r499;
mul.lo.s32 %r127, %r201, %r3;
mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r611, %r5;
mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r611, %r11;
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
add.s32 %r612, %r612, %r127;
add.s32 %r611, %r611, %r3;
add.s32 %r613, %r613, 1;
setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r513, %r3;
mov.u32 %r514, 31;
sub.s32 %r515, %r514, %r513;
mov.u32 %r516, 1;
shl.b32 %r138, %r516, %r515;
setp.lt.u32 %p166, %r5, %r138;
add.s32 %r517, %r138, %r5;
setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r518, %r49, %r138;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r519, %r138, 31;
add.s32 %r520, %r138, %r519;
shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r616, %r624;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
add.s32 %r521, %r616, %r49;
mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r141, %r616, 1;
setp.gt.u32 %p171, %r616, 3;
mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r522, %r49, 1;
mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r617, %r624;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
add.s32 %r523, %r617, %r49;
mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r143, %r617, 1;
setp.gt.u32 %p177, %r617, 3;
mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r524, %r201, 1;
shr.u32 %r525, %r524, 31;
add.s32 %r526, %r524, %r525;
shr.s32 %r527, %r526, 1;
add.s32 %r528, %r4, %r527;
add.s32 %r529, %r528, -1;
div.s32 %r530, %r529, %r4;
setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
shl.b32 %r144, %r9, 1;
mul.lo.s32 %r531, %r4, %r122;
shl.b32 %r145, %r531, 1;
add.s32 %r532, %r144, %r145;
or.b32 %r533, %r532, 1;
setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r536, %r201, 1;
shr.u32 %r537, %r536, 31;
add.s32 %r538, %r536, %r537;
shr.s32 %r539, %r538, 1;
add.s32 %r540, %r4, %r539;
add.s32 %r541, %r540, -1;
shl.b32 %r542, %r9, 1;
shl.b32 %r543, %r4, 1;
mad.lo.s32 %r544, %r543, %r122, %r542;
or.b32 %r545, %r544, 1;
setp.ge.s32 %p184, %r545, %r201;
div.s32 %r546, %r541, %r4;
setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r547, %r4, %r122;
shl.b32 %r548, %r547, 1;
mad.lo.s32 %r549, %r201, %r5, %r548;
add.s32 %r619, %r549, %r542;
mul.lo.s32 %r147, %r201, %r3;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r618, %r5;
mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r618, %r11;
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
add.s32 %r619, %r619, %r147;
add.s32 %r618, %r618, %r3;
add.s32 %r620, %r620, 1;
setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r623, %r624;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
add.s32 %r556, %r623, %r49;
mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r159, %r623, 1;
setp.gt.u32 %p191, %r623, 3;
mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
add.s32 %r557, %r624, %r49;
mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r161, %r624, 1;
setp.gt.u32 %p197, %r624, 3;
mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r558, %r201, 1;
shr.u32 %r559, %r558, 31;
add.s32 %r560, %r558, %r559;
shr.s32 %r561, %r560, 1;
add.s32 %r562, %r4, %r561;
add.s32 %r563, %r562, -1;
div.s32 %r564, %r563, %r4;
setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
shl.b32 %r162, %r9, 1;
mul.lo.s32 %r565, %r4, %r122;
shl.b32 %r163, %r565, 1;
add.s32 %r566, %r162, %r163;
or.b32 %r567, %r566, 1;
setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_28_cu_16ee897e_723310nvfuser_28ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<631>;
+ .reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r200, %r201}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r210, %r211}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r214, %r215}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r237, %r202, 7;
- shr.s32 %r238, %r237, 31;
- shr.u32 %r239, %r238, 29;
- add.s32 %r240, %r237, %r239;
- shr.s32 %r2, %r240, 3;
+ add.s32 %r236, %r201, 7;
+ shr.s32 %r237, %r236, 31;
+ shr.u32 %r238, %r237, 29;
+ add.s32 %r239, %r236, %r238;
+ shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
- mov.u32 %r241, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
+ mov.u32 %r240, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r242, [%rd42], %r5;
+ atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r243, %r4, %r2;
- shl.b32 %r244, %r243, 4;
- or.b32 %r245, %r244, 15;
- and.b32 %r7, %r245, -16;
- add.s32 %r246, %r245, %r7;
- and.b32 %r247, %r246, -16;
- cvt.s64.s32 %rd1, %r247;
- shl.b32 %r248, %r4, 2;
- max.s32 %r249, %r2, %r3;
- mad.lo.s32 %r250, %r248, %r249, 15;
- and.b32 %r251, %r250, -16;
- cvt.u64.u32 %rd2, %r251;
+ mul.lo.s32 %r242, %r4, %r2;
+ shl.b32 %r243, %r242, 4;
+ or.b32 %r244, %r243, 15;
+ and.b32 %r7, %r244, -16;
+ add.s32 %r245, %r244, %r7;
+ and.b32 %r246, %r245, -16;
+ cvt.s64.s32 %rd1, %r246;
+ shl.b32 %r247, %r4, 2;
+ max.s32 %r248, %r2, %r3;
+ mad.lo.s32 %r249, %r247, %r248, 15;
+ and.b32 %r250, %r249, -16;
+ cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r252, %r8, 7;
- setp.lt.s32 %p11, %r252, %r202;
+ or.b32 %r251, %r8, 7;
+ setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
-
-
- shl.b32 %r256, %r5, 4;
- add.s32 %r254, %r253, %r256;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
+
+
+ shl.b32 %r255, %r5, 4;
+ add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
- mov.u32 %r255, 0;
+ mov.u32 %r254, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r255, 0;
- cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r254, 0;
+ cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r583, %r6, 4;
- add.s32 %r257, %r4, 215;
- div.s32 %r258, %r257, %r4;
+ shl.b32 %r577, %r6, 4;
+ add.s32 %r256, %r4, 215;
+ div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r259, %r11, %r258;
- add.s32 %r260, %r259, -1;
- div.s32 %r12, %r260, %r11;
+ add.s32 %r258, %r11, %r257;
+ add.s32 %r259, %r258, -1;
+ div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r202;
+ cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
- mov.u32 %r262, %ctaid.y;
- mul.lo.s32 %r263, %r12, %r4;
- mul.lo.s32 %r13, %r263, %r262;
- shl.b32 %r264, %r9, 1;
- mov.u32 %r265, 1;
- shl.b32 %r266, %r5, 4;
- mad.lo.s32 %r14, %r264, %r202, %r266;
- mul.lo.s32 %r267, %r202, %r9;
- cvt.s64.s32 %rd52, %r267;
+ mov.u32 %r261, %ctaid.y;
+ mul.lo.s32 %r262, %r12, %r4;
+ mul.lo.s32 %r13, %r262, %r261;
+ mad.lo.s32 %r263, %r2, %r9, %r5;
+ shl.b32 %r14, %r263, 4;
+ mul.lo.s32 %r264, %r201, %r9;
+ cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r268, %r13, %r202;
- cvt.s64.s32 %rd6, %r268;
+ mul.lo.s32 %r265, %r13, %r201;
+ cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r269, %tid.z;
- mad.lo.s32 %r270, %r4, %r269, %r9;
- mad.lo.s32 %r15, %r270, %r3, %r5;
+ mov.u32 %r266, %tid.z;
+ mad.lo.s32 %r267, %r4, %r266, %r9;
+ mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
- clz.b32 %r271, %r3;
- mov.u32 %r272, 31;
- sub.s32 %r273, %r272, %r271;
- shl.b32 %r16, %r265, %r273;
+ clz.b32 %r268, %r3;
+ mov.u32 %r269, 31;
+ sub.s32 %r270, %r269, %r268;
+ mov.u32 %r271, 1;
+ shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
- add.s32 %r274, %r16, %r5;
- setp.lt.u32 %p18, %r274, %r3;
+ add.s32 %r272, %r16, %r5;
+ setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
- add.s32 %r275, %r15, %r16;
- mul.wide.s32 %rd55, %r275, 4;
+ add.s32 %r273, %r15, %r16;
+ mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
- shr.u32 %r276, %r16, 31;
- add.s32 %r277, %r16, %r276;
- shr.s32 %r17, %r277, 1;
- add.s32 %r18, %r267, %r8;
+ shr.u32 %r274, %r16, 31;
+ add.s32 %r275, %r16, %r274;
+ shr.s32 %r17, %r275, 1;
+ shl.b32 %r276, %r9, 3;
+ mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
- mul.wide.s32 %rd57, %r18, 2;
+ mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
- mul.wide.s32 %rd61, %r270, 4;
+ mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
- add.s32 %r282, %r14, %r281;
+ add.s32 %r282, %r281, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
- add.s32 %r285, %r14, %r284;
+ add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r580, %r4;
- add.s32 %r279, %r23, %r9;
- add.s32 %r24, %r279, %r13;
- setp.gt.s32 %p19, %r24, 215;
+ mul.lo.s32 %r22, %r574, %r4;
+ add.s32 %r279, %r22, %r9;
+ add.s32 %r23, %r279, %r13;
+ setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
- mul.lo.s32 %r280, %r24, %r211;
+ mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p20, %r24, 216;
+ setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
- mul.lo.s32 %r578, %r580, %r4;
- mul.lo.s32 %r287, %r578, %r202;
+ mul.lo.s32 %r572, %r574, %r4;
+ mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r576, %r580, %r4;
- add.s32 %r575, %r576, %r9;
- add.s32 %r574, %r575, %r13;
- setp.gt.s32 %p204, %r574, 215;
+ mul.lo.s32 %r571, %r574, %r4;
+ add.s32 %r570, %r571, %r9;
+ add.s32 %r569, %r570, %r13;
+ setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
- mul.lo.s32 %r288, %r24, %r215;
+ mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r583, %r583, 2;
+ shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
- mov.u32 %r581, %r17;
+ mov.u32 %r575, %r17;
$L__BB0_20:
- setp.ge.u32 %p26, %r5, %r581;
+ setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
- add.s32 %r317, %r581, %r15;
+ add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r581, 1;
- setp.gt.u32 %p27, %r581, 3;
- mov.u32 %r581, %r36;
+ shr.u32 %r35, %r575, 1;
+ setp.gt.u32 %p27, %r575, 3;
+ mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
- mov.u32 %r582, %r17;
+ mov.u32 %r576, %r17;
$L__BB0_30:
- setp.ge.u32 %p32, %r5, %r582;
+ setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
- add.s32 %r318, %r582, %r15;
+ add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r582, 1;
- setp.gt.u32 %p33, %r582, 3;
- mov.u32 %r582, %r38;
+ shr.u32 %r37, %r576, 1;
+ setp.gt.u32 %p33, %r576, 3;
+ mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
@@ -801,13 +800,12 @@
{ cvt.rn.f16.f32 %rs124, %f365;}
mov.b32 %r326, {%rs124, %rs128};
- add.s32 %r351, %r13, %r577;
- mad.lo.s32 %r352, %r351, %r202, %r18;
- mul.wide.s32 %rd82, %r352, 2;
+ mad.lo.s32 %r351, %r23, %r201, %r8;
+ mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.f16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r580, %r580, 1;
- setp.lt.s32 %p39, %r580, %r12;
+ add.s32 %r574, %r574, 1;
+ setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r353, %tid.z;
- mad.lo.s32 %r354, %r4, %r353, %r9;
- mad.lo.s32 %r50, %r354, %r3, %r5;
- mul.wide.u32 %rd83, %r50, 4;
+ mov.u32 %r352, %tid.z;
+ mad.lo.s32 %r353, %r4, %r352, %r9;
+ mad.lo.s32 %r49, %r353, %r3, %r5;
+ mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r355, %r4;
- mov.u32 %r356, 31;
- sub.s32 %r51, %r356, %r355;
- mov.u32 %r357, 1;
- shl.b32 %r614, %r357, %r51;
- setp.lt.u32 %p40, %r9, %r614;
- add.s32 %r358, %r614, %r9;
- setp.lt.u32 %p41, %r358, %r4;
+ clz.b32 %r354, %r4;
+ mov.u32 %r355, 31;
+ sub.s32 %r50, %r355, %r354;
+ mov.u32 %r356, 1;
+ shl.b32 %r608, %r356, %r50;
+ setp.lt.u32 %p40, %r9, %r608;
+ add.s32 %r357, %r608, %r9;
+ setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
- shl.b32 %r359, %r3, %r51;
- add.s32 %r360, %r50, %r359;
- mul.wide.s32 %rd85, %r360, 4;
+ shl.b32 %r358, %r3, %r50;
+ add.s32 %r359, %r49, %r358;
+ mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p43, %r614, 4;
+ setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
- mov.u32 %r584, %r614;
+ mov.u32 %r578, %r608;
$L__BB0_48:
- shr.u32 %r54, %r584, 1;
- setp.ge.u32 %p44, %r9, %r54;
+ shr.u32 %r53, %r578, 1;
+ setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
- mad.lo.s32 %r361, %r54, %r3, %r50;
- mul.wide.s32 %rd88, %r361, 4;
+ mad.lo.s32 %r360, %r53, %r3, %r49;
+ mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p45, %r584, 7;
- mov.u32 %r584, %r54;
+ setp.gt.u32 %p45, %r578, 7;
+ mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r585, 0;
- add.s32 %r363, %r50, %r3;
- mul.wide.u32 %rd91, %r363, 4;
+ mov.u32 %r579, 0;
+ add.s32 %r362, %r49, %r3;
+ mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r585, %f660;
+ mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
- shl.b32 %r364, %r3, %r51;
- add.s32 %r365, %r50, %r364;
- mul.wide.s32 %rd93, %r365, 4;
+ shl.b32 %r363, %r3, %r50;
+ add.s32 %r364, %r49, %r363;
+ mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
- mov.u32 %r586, %r614;
+ mov.u32 %r580, %r608;
$L__BB0_59:
- shr.u32 %r58, %r586, 1;
- setp.ge.u32 %p50, %r9, %r58;
+ shr.u32 %r57, %r580, 1;
+ setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
- mad.lo.s32 %r366, %r58, %r3, %r50;
- mul.wide.s32 %rd96, %r366, 4;
+ mad.lo.s32 %r365, %r57, %r3, %r49;
+ mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p51, %r586, 7;
- mov.u32 %r586, %r58;
+ setp.gt.u32 %p51, %r580, 7;
+ mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r587, 0;
+ mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r587, %f661;
+ mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
- shl.b32 %r368, %r3, %r51;
- add.s32 %r369, %r50, %r368;
- mul.wide.s32 %rd99, %r369, 4;
+ shl.b32 %r367, %r3, %r50;
+ add.s32 %r368, %r49, %r367;
+ mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
- mov.u32 %r588, %r614;
+ mov.u32 %r582, %r608;
$L__BB0_70:
- shr.u32 %r62, %r588, 1;
- setp.ge.u32 %p56, %r9, %r62;
+ shr.u32 %r61, %r582, 1;
+ setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
- mad.lo.s32 %r370, %r62, %r3, %r50;
- mul.wide.s32 %rd102, %r370, 4;
+ mad.lo.s32 %r369, %r61, %r3, %r49;
+ mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p57, %r588, 7;
- mov.u32 %r588, %r62;
+ setp.gt.u32 %p57, %r582, 7;
+ mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r589, 0;
+ mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r589, %f662;
+ mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
- shl.b32 %r372, %r3, %r51;
- add.s32 %r373, %r50, %r372;
- mul.wide.s32 %rd105, %r373, 4;
+ shl.b32 %r371, %r3, %r50;
+ add.s32 %r372, %r49, %r371;
+ mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
- mov.u32 %r590, %r614;
+ mov.u32 %r584, %r608;
$L__BB0_81:
- shr.u32 %r66, %r590, 1;
- setp.ge.u32 %p62, %r9, %r66;
+ shr.u32 %r65, %r584, 1;
+ setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
- mad.lo.s32 %r374, %r66, %r3, %r50;
- mul.wide.s32 %rd108, %r374, 4;
+ mad.lo.s32 %r373, %r65, %r3, %r49;
+ mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p63, %r590, 7;
- mov.u32 %r590, %r66;
+ setp.gt.u32 %p63, %r584, 7;
+ mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r591, 0;
+ mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r591, %f663;
+ mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
- shl.b32 %r376, %r3, %r51;
- add.s32 %r377, %r50, %r376;
- mul.wide.s32 %rd111, %r377, 4;
+ shl.b32 %r375, %r3, %r50;
+ add.s32 %r376, %r49, %r375;
+ mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
- mov.u32 %r592, %r614;
+ mov.u32 %r586, %r608;
$L__BB0_92:
- shr.u32 %r70, %r592, 1;
- setp.ge.u32 %p68, %r9, %r70;
+ shr.u32 %r69, %r586, 1;
+ setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
- mad.lo.s32 %r378, %r70, %r3, %r50;
- mul.wide.s32 %rd114, %r378, 4;
+ mad.lo.s32 %r377, %r69, %r3, %r49;
+ mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p69, %r592, 7;
- mov.u32 %r592, %r70;
+ setp.gt.u32 %p69, %r586, 7;
+ mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r593, 0;
+ mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r593, %f664;
+ mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
- shl.b32 %r380, %r3, %r51;
- add.s32 %r381, %r50, %r380;
- mul.wide.s32 %rd117, %r381, 4;
+ shl.b32 %r379, %r3, %r50;
+ add.s32 %r380, %r49, %r379;
+ mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
- mov.u32 %r594, %r614;
+ mov.u32 %r588, %r608;
$L__BB0_103:
- shr.u32 %r74, %r594, 1;
- setp.ge.u32 %p74, %r9, %r74;
+ shr.u32 %r73, %r588, 1;
+ setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
- mad.lo.s32 %r382, %r74, %r3, %r50;
- mul.wide.s32 %rd120, %r382, 4;
+ mad.lo.s32 %r381, %r73, %r3, %r49;
+ mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p75, %r594, 7;
- mov.u32 %r594, %r74;
+ setp.gt.u32 %p75, %r588, 7;
+ mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r595, 0;
+ mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r595, %f665;
+ mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
- shl.b32 %r384, %r3, %r51;
- add.s32 %r385, %r50, %r384;
- mul.wide.s32 %rd123, %r385, 4;
+ shl.b32 %r383, %r3, %r50;
+ add.s32 %r384, %r49, %r383;
+ mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
- mov.u32 %r596, %r614;
+ mov.u32 %r590, %r608;
$L__BB0_114:
- shr.u32 %r78, %r596, 1;
- setp.ge.u32 %p80, %r9, %r78;
+ shr.u32 %r77, %r590, 1;
+ setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
- mad.lo.s32 %r386, %r78, %r3, %r50;
- mul.wide.s32 %rd126, %r386, 4;
+ mad.lo.s32 %r385, %r77, %r3, %r49;
+ mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p81, %r596, 7;
- mov.u32 %r596, %r78;
+ setp.gt.u32 %p81, %r590, 7;
+ mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r597, 0;
+ mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r597, %f666;
+ mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
- shl.b32 %r388, %r3, %r51;
- add.s32 %r389, %r50, %r388;
- mul.wide.s32 %rd129, %r389, 4;
+ shl.b32 %r387, %r3, %r50;
+ add.s32 %r388, %r49, %r387;
+ mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
- mov.u32 %r598, %r614;
+ mov.u32 %r592, %r608;
$L__BB0_125:
- shr.u32 %r82, %r598, 1;
- setp.ge.u32 %p86, %r9, %r82;
+ shr.u32 %r81, %r592, 1;
+ setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
- mad.lo.s32 %r390, %r82, %r3, %r50;
- mul.wide.s32 %rd132, %r390, 4;
+ mad.lo.s32 %r389, %r81, %r3, %r49;
+ mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p87, %r598, 7;
- mov.u32 %r598, %r82;
+ setp.gt.u32 %p87, %r592, 7;
+ mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r599, 0;
+ mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r599, %f667;
+ mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r583, 4;
+ shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
- shl.b32 %r392, %r3, %r51;
- add.s32 %r393, %r50, %r392;
- mul.wide.s32 %rd135, %r393, 4;
+ shl.b32 %r391, %r3, %r50;
+ add.s32 %r392, %r49, %r391;
+ mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
- mov.u32 %r600, %r614;
+ mov.u32 %r594, %r608;
$L__BB0_136:
- shr.u32 %r87, %r600, 1;
- setp.ge.u32 %p92, %r9, %r87;
+ shr.u32 %r86, %r594, 1;
+ setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
- mad.lo.s32 %r394, %r87, %r3, %r50;
- mul.wide.s32 %rd138, %r394, 4;
+ mad.lo.s32 %r393, %r86, %r3, %r49;
+ mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p93, %r600, 7;
- mov.u32 %r600, %r87;
+ setp.gt.u32 %p93, %r594, 7;
+ mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r601, 0;
+ mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r601, %f668;
+ mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
- shl.b32 %r396, %r3, %r51;
- add.s32 %r397, %r50, %r396;
- mul.wide.s32 %rd141, %r397, 4;
+ shl.b32 %r395, %r3, %r50;
+ add.s32 %r396, %r49, %r395;
+ mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
- mov.u32 %r602, %r614;
+ mov.u32 %r596, %r608;
$L__BB0_147:
- shr.u32 %r91, %r602, 1;
- setp.ge.u32 %p98, %r9, %r91;
+ shr.u32 %r90, %r596, 1;
+ setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
- mad.lo.s32 %r398, %r91, %r3, %r50;
- mul.wide.s32 %rd144, %r398, 4;
+ mad.lo.s32 %r397, %r90, %r3, %r49;
+ mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p99, %r602, 7;
- mov.u32 %r602, %r91;
+ setp.gt.u32 %p99, %r596, 7;
+ mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r603, 0;
+ mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r603, %f669;
+ mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
- shl.b32 %r400, %r3, %r51;
- add.s32 %r401, %r50, %r400;
- mul.wide.s32 %rd147, %r401, 4;
+ shl.b32 %r399, %r3, %r50;
+ add.s32 %r400, %r49, %r399;
+ mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
- mov.u32 %r604, %r614;
+ mov.u32 %r598, %r608;
$L__BB0_158:
- shr.u32 %r95, %r604, 1;
- setp.ge.u32 %p104, %r9, %r95;
+ shr.u32 %r94, %r598, 1;
+ setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
- mad.lo.s32 %r402, %r95, %r3, %r50;
- mul.wide.s32 %rd150, %r402, 4;
+ mad.lo.s32 %r401, %r94, %r3, %r49;
+ mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p105, %r604, 7;
- mov.u32 %r604, %r95;
+ setp.gt.u32 %p105, %r598, 7;
+ mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r605, 0;
+ mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r605, %f670;
+ mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
- shl.b32 %r404, %r3, %r51;
- add.s32 %r405, %r50, %r404;
- mul.wide.s32 %rd153, %r405, 4;
+ shl.b32 %r403, %r3, %r50;
+ add.s32 %r404, %r49, %r403;
+ mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
- mov.u32 %r606, %r614;
+ mov.u32 %r600, %r608;
$L__BB0_169:
- shr.u32 %r99, %r606, 1;
- setp.ge.u32 %p110, %r9, %r99;
+ shr.u32 %r98, %r600, 1;
+ setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
- mad.lo.s32 %r406, %r99, %r3, %r50;
- mul.wide.s32 %rd156, %r406, 4;
+ mad.lo.s32 %r405, %r98, %r3, %r49;
+ mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p111, %r606, 7;
- mov.u32 %r606, %r99;
+ setp.gt.u32 %p111, %r600, 7;
+ mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r607, 0;
+ mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r607, %f671;
+ mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
- shl.b32 %r408, %r3, %r51;
- add.s32 %r409, %r50, %r408;
- mul.wide.s32 %rd159, %r409, 4;
+ shl.b32 %r407, %r3, %r50;
+ add.s32 %r408, %r49, %r407;
+ mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
- mov.u32 %r608, %r614;
+ mov.u32 %r602, %r608;
$L__BB0_180:
- shr.u32 %r103, %r608, 1;
- setp.ge.u32 %p116, %r9, %r103;
+ shr.u32 %r102, %r602, 1;
+ setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
- mad.lo.s32 %r410, %r103, %r3, %r50;
- mul.wide.s32 %rd162, %r410, 4;
+ mad.lo.s32 %r409, %r102, %r3, %r49;
+ mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p117, %r608, 7;
- mov.u32 %r608, %r103;
+ setp.gt.u32 %p117, %r602, 7;
+ mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r609, 0;
+ mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r609, %f672;
+ mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
- shl.b32 %r412, %r3, %r51;
- add.s32 %r413, %r50, %r412;
- mul.wide.s32 %rd165, %r413, 4;
+ shl.b32 %r411, %r3, %r50;
+ add.s32 %r412, %r49, %r411;
+ mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
- mov.u32 %r610, %r614;
+ mov.u32 %r604, %r608;
$L__BB0_191:
- shr.u32 %r107, %r610, 1;
- setp.ge.u32 %p122, %r9, %r107;
+ shr.u32 %r106, %r604, 1;
+ setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
- mad.lo.s32 %r414, %r107, %r3, %r50;
- mul.wide.s32 %rd168, %r414, 4;
+ mad.lo.s32 %r413, %r106, %r3, %r49;
+ mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p123, %r610, 7;
- mov.u32 %r610, %r107;
+ setp.gt.u32 %p123, %r604, 7;
+ mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r611, 0;
+ mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r611, %f673;
+ mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
- shl.b32 %r416, %r3, %r51;
- add.s32 %r417, %r50, %r416;
- mul.wide.s32 %rd171, %r417, 4;
+ shl.b32 %r415, %r3, %r50;
+ add.s32 %r416, %r49, %r415;
+ mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
- mov.u32 %r612, %r614;
+ mov.u32 %r606, %r608;
$L__BB0_202:
- shr.u32 %r111, %r612, 1;
- setp.ge.u32 %p128, %r9, %r111;
+ shr.u32 %r110, %r606, 1;
+ setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
- mad.lo.s32 %r418, %r111, %r3, %r50;
- mul.wide.s32 %rd174, %r418, 4;
+ mad.lo.s32 %r417, %r110, %r3, %r49;
+ mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p129, %r612, 7;
- mov.u32 %r612, %r111;
+ setp.gt.u32 %p129, %r606, 7;
+ mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r613, 0;
+ mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r613, %f674;
+ mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
- shl.b32 %r420, %r3, %r51;
- add.s32 %r421, %r50, %r420;
- mul.wide.s32 %rd177, %r421, 4;
+ shl.b32 %r419, %r3, %r50;
+ add.s32 %r420, %r49, %r419;
+ mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r614, 1;
- setp.ge.u32 %p134, %r9, %r115;
+ shr.u32 %r114, %r608, 1;
+ setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
- mad.lo.s32 %r422, %r115, %r3, %r50;
- mul.wide.s32 %rd180, %r422, 4;
+ mad.lo.s32 %r421, %r114, %r3, %r49;
+ mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p135, %r614, 7;
- mov.u32 %r614, %r115;
+ setp.gt.u32 %p135, %r608, 7;
+ mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r615, 0;
+ mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@@ -1735,255 +1733,251 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r615, %f675;
+ mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
- shl.b32 %r573, %r5, 3;
- mov.u32 %r448, %ctaid.y;
- mad.lo.s32 %r449, %r202, %r448, %r573;
- add.s32 %r450, %r449, %r85;
- mul.wide.s32 %rd189, %r450, 4;
+ mov.u32 %r447, %ctaid.y;
+ mad.lo.s32 %r448, %r201, %r447, %r8;
+ add.s32 %r449, %r448, %r84;
+ mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
- st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
-
- add.s32 %r451, %r450, 4;
- mul.wide.s32 %rd190, %r451, 4;
+ st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
+
+ add.s32 %r450, %r449, 4;
+ mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
- st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
- add.s32 %r424, %r570, 3;
- sub.s32 %r118, %r424, %r202;
- mov.u32 %r425, %ctaid.y;
- mad.lo.s32 %r119, %r202, %r425, %r570;
- neg.s32 %r426, %r85;
- setp.ge.s32 %p141, %r118, %r426;
+ add.s32 %r423, %r8, 3;
+ sub.s32 %r117, %r423, %r201;
+ mov.u32 %r424, %ctaid.y;
+ mad.lo.s32 %r118, %r201, %r424, %r8;
+ neg.s32 %r425, %r84;
+ setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
- add.s32 %r431, %r119, %r85;
- mul.wide.s32 %rd184, %r431, 4;
+ add.s32 %r430, %r118, %r84;
+ mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
- st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
+ st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
$L__BB0_222:
- mov.u32 %r432, -4;
- sub.s32 %r433, %r432, %r85;
- setp.ge.s32 %p143, %r118, %r433;
+ mov.u32 %r431, -4;
+ sub.s32 %r432, %r431, %r84;
+ setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
- add.s32 %r438, %r119, %r85;
- add.s32 %r439, %r438, 4;
- mul.wide.s32 %rd186, %r439, 4;
+ add.s32 %r437, %r118, %r84;
+ add.s32 %r438, %r437, 4;
+ mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
$L__BB0_226:
- shl.b32 %r120, %r583, 5;
+ shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
- shl.b32 %r572, %r5, 3;
- mov.u32 %r476, %ctaid.y;
- mad.lo.s32 %r477, %r202, %r476, %r572;
- add.s32 %r478, %r477, %r120;
- mul.wide.s32 %rd197, %r478, 4;
+ mov.u32 %r475, %ctaid.y;
+ mad.lo.s32 %r476, %r201, %r475, %r8;
+ add.s32 %r477, %r476, %r119;
+ mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
- st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
-
- add.s32 %r479, %r478, 4;
- mul.wide.s32 %rd198, %r479, 4;
+ st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
+
+ add.s32 %r478, %r477, 4;
+ mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
- st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
bra.uni $L__BB0_233;
$L__BB0_227:
- shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
- add.s32 %r452, %r571, 3;
- sub.s32 %r121, %r452, %r202;
- mov.u32 %r453, %ctaid.y;
- mad.lo.s32 %r122, %r202, %r453, %r571;
- neg.s32 %r454, %r120;
- setp.ge.s32 %p150, %r121, %r454;
+ add.s32 %r451, %r8, 3;
+ sub.s32 %r120, %r451, %r201;
+ mov.u32 %r452, %ctaid.y;
+ mad.lo.s32 %r121, %r201, %r452, %r8;
+ neg.s32 %r453, %r119;
+ setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
- add.s32 %r459, %r122, %r120;
- mul.wide.s32 %rd192, %r459, 4;
+ add.s32 %r458, %r121, %r119;
+ mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
- st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
+ st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
$L__BB0_229:
- mov.u32 %r460, -4;
- sub.s32 %r461, %r460, %r120;
- setp.ge.s32 %p152, %r121, %r461;
+ mov.u32 %r459, -4;
+ sub.s32 %r460, %r459, %r119;
+ setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
- add.s32 %r466, %r122, %r120;
- add.s32 %r467, %r466, 4;
- mul.wide.s32 %rd194, %r467, 4;
+ add.s32 %r465, %r121, %r119;
+ add.s32 %r466, %r465, 4;
+ mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
$L__BB0_233:
- mov.u32 %r123, %ctaid.y;
+ mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r480, %r5, %r9;
- or.b32 %r482, %r480, %r353;
- setp.ne.s32 %p156, %r482, 0;
+ or.b32 %r479, %r5, %r9;
+ or.b32 %r481, %r479, %r352;
+ setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
- mov.u32 %r483, %ctaid.x;
- mov.u32 %r484, %ctaid.z;
- mov.u32 %r485, %nctaid.x;
- mad.lo.s32 %r486, %r484, %r485, %r483;
- mul.wide.s32 %rd200, %r486, 8;
+ mov.u32 %r482, %ctaid.x;
+ mov.u32 %r483, %ctaid.z;
+ mov.u32 %r484, %nctaid.x;
+ mad.lo.s32 %r485, %r483, %r484, %r482;
+ mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
- add.s32 %r487, %r11, -1;
- setp.eq.s32 %p157, %r123, %r487;
+ add.s32 %r486, %r11, -1;
+ setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
- mov.u32 %r616, 8;
+ mov.u32 %r610, 8;
$L__BB0_236:
- nanosleep.u32 %r616;
-
- setp.lt.u32 %p159, %r616, 256;
- selp.u32 %r490, 1, 0, %p159;
- shl.b32 %r616, %r616, %r490;
+ nanosleep.u32 %r610;
+
+ setp.lt.u32 %p159, %r610, 256;
+ selp.u32 %r489, 1, 0, %p159;
+ shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
- add.s32 %r491, %r11, %r3;
- add.s32 %r492, %r491, -1;
- div.s32 %r126, %r492, %r3;
- setp.lt.s32 %p161, %r126, 1;
+ add.s32 %r490, %r11, %r3;
+ add.s32 %r491, %r490, -1;
+ div.s32 %r125, %r491, %r3;
+ setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
- add.s32 %r494, %r202, 1;
- shr.u32 %r495, %r494, 31;
- add.s32 %r496, %r494, %r495;
- shr.s32 %r497, %r496, 1;
- add.s32 %r498, %r4, %r497;
- add.s32 %r499, %r498, -1;
- shl.b32 %r500, %r9, 1;
- shl.b32 %r501, %r4, 1;
- mad.lo.s32 %r502, %r501, %r123, %r500;
- or.b32 %r503, %r502, 1;
- setp.ge.s32 %p162, %r503, %r202;
- div.s32 %r504, %r499, %r4;
- setp.ge.s32 %p163, %r123, %r504;
+ add.s32 %r493, %r201, 1;
+ shr.u32 %r494, %r493, 31;
+ add.s32 %r495, %r493, %r494;
+ shr.s32 %r496, %r495, 1;
+ add.s32 %r497, %r4, %r496;
+ add.s32 %r498, %r497, -1;
+ shl.b32 %r499, %r9, 1;
+ shl.b32 %r500, %r4, 1;
+ mad.lo.s32 %r501, %r500, %r122, %r499;
+ or.b32 %r502, %r501, 1;
+ setp.ge.s32 %p162, %r502, %r201;
+ div.s32 %r503, %r498, %r4;
+ setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
- mul.lo.s32 %r505, %r4, %r123;
- shl.b32 %r506, %r505, 1;
- mad.lo.s32 %r507, %r202, %r5, %r506;
- add.s32 %r618, %r507, %r500;
- mul.lo.s32 %r128, %r202, %r3;
- mov.u32 %r493, 0;
+ mul.lo.s32 %r504, %r4, %r122;
+ shl.b32 %r505, %r504, 1;
+ mad.lo.s32 %r506, %r201, %r5, %r505;
+ add.s32 %r612, %r506, %r499;
+ mul.lo.s32 %r127, %r201, %r3;
+ mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r617, %r5;
- mov.u32 %r619, %r493;
+ mov.u32 %r611, %r5;
+ mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
- setp.ge.s32 %p164, %r617, %r11;
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ setp.ge.s32 %p164, %r611, %r11;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
- mul.wide.s32 %rd210, %r618, 4;
+ mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
- ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
+ ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
$L__BB0_242:
- mov.b32 %f558, %r621;
+ mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r620;
+ mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
- add.s32 %r618, %r618, %r128;
- add.s32 %r617, %r617, %r3;
- add.s32 %r619, %r619, 1;
- setp.lt.s32 %p165, %r619, %r126;
+ add.s32 %r612, %r612, %r127;
+ add.s32 %r611, %r611, %r3;
+ add.s32 %r613, %r613, 1;
+ setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
- clz.b32 %r514, %r3;
- mov.u32 %r515, 31;
- sub.s32 %r516, %r515, %r514;
- mov.u32 %r517, 1;
- shl.b32 %r139, %r517, %r516;
- setp.lt.u32 %p166, %r5, %r139;
- add.s32 %r518, %r139, %r5;
- setp.lt.u32 %p167, %r518, %r3;
+ clz.b32 %r513, %r3;
+ mov.u32 %r514, 31;
+ sub.s32 %r515, %r514, %r513;
+ mov.u32 %r516, 1;
+ shl.b32 %r138, %r516, %r515;
+ setp.lt.u32 %p166, %r5, %r138;
+ add.s32 %r517, %r138, %r5;
+ setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
- add.s32 %r519, %r50, %r139;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r138;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
- shr.u32 %r520, %r139, 31;
- add.s32 %r521, %r139, %r520;
- shr.s32 %r630, %r521, 1;
+ shr.u32 %r519, %r138, 31;
+ add.s32 %r520, %r138, %r519;
+ shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
@@ -1991,38 +1985,38 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
- setp.lt.s32 %p169, %r139, 4;
+ setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
- mov.u32 %r622, %r630;
+ mov.u32 %r616, %r624;
$L__BB0_247:
- setp.ge.u32 %p170, %r5, %r622;
+ setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
- add.s32 %r522, %r622, %r50;
- mul.wide.s32 %rd213, %r522, 4;
+ add.s32 %r521, %r616, %r49;
+ mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
- shr.u32 %r142, %r622, 1;
- setp.gt.u32 %p171, %r622, 3;
- mov.u32 %r622, %r142;
+ shr.u32 %r141, %r616, 1;
+ setp.gt.u32 %p171, %r616, 3;
+ mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
- add.s32 %r523, %r50, 1;
- mul.wide.u32 %rd216, %r523, 4;
+ add.s32 %r522, %r49, 1;
+ mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
@@ -2050,29 +2044,29 @@
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
- mov.u32 %r623, %r630;
+ mov.u32 %r617, %r624;
$L__BB0_257:
- setp.ge.u32 %p176, %r5, %r623;
+ setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
- add.s32 %r524, %r623, %r50;
- mul.wide.s32 %rd218, %r524, 4;
+ add.s32 %r523, %r617, %r49;
+ mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
- shr.u32 %r144, %r623, 1;
- setp.gt.u32 %p177, %r623, 3;
- mov.u32 %r623, %r144;
+ shr.u32 %r143, %r617, 1;
+ setp.gt.u32 %p177, %r617, 3;
+ mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
@@ -2091,90 +2085,90 @@
{ cvt.rn.f16.f32 %rs130, %f681;}
@%p10 bra $L__BB0_267;
- add.s32 %r525, %r202, 1;
- shr.u32 %r526, %r525, 31;
- add.s32 %r527, %r525, %r526;
- shr.s32 %r528, %r527, 1;
- add.s32 %r529, %r4, %r528;
- add.s32 %r530, %r529, -1;
- div.s32 %r531, %r530, %r4;
- setp.ge.s32 %p181, %r123, %r531;
+ add.s32 %r524, %r201, 1;
+ shr.u32 %r525, %r524, 31;
+ add.s32 %r526, %r524, %r525;
+ shr.s32 %r527, %r526, 1;
+ add.s32 %r528, %r4, %r527;
+ add.s32 %r529, %r528, -1;
+ div.s32 %r530, %r529, %r4;
+ setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
- shl.b32 %r145, %r9, 1;
- mul.lo.s32 %r532, %r4, %r123;
- shl.b32 %r146, %r532, 1;
- add.s32 %r533, %r145, %r146;
- or.b32 %r534, %r533, 1;
- setp.ge.s32 %p182, %r534, %r202;
+ shl.b32 %r144, %r9, 1;
+ mul.lo.s32 %r531, %r4, %r122;
+ shl.b32 %r145, %r531, 1;
+ add.s32 %r532, %r144, %r145;
+ or.b32 %r533, %r532, 1;
+ setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r535, %r146, %r145;
+ add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
- mul.wide.s32 %rd222, %r535, 2;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
- add.s32 %r537, %r202, 1;
- shr.u32 %r538, %r537, 31;
- add.s32 %r539, %r537, %r538;
- shr.s32 %r540, %r539, 1;
- add.s32 %r541, %r4, %r540;
- add.s32 %r542, %r541, -1;
- shl.b32 %r543, %r9, 1;
- shl.b32 %r544, %r4, 1;
- mad.lo.s32 %r545, %r544, %r123, %r543;
- or.b32 %r546, %r545, 1;
- setp.ge.s32 %p184, %r546, %r202;
- div.s32 %r547, %r542, %r4;
- setp.ge.s32 %p185, %r123, %r547;
+ add.s32 %r536, %r201, 1;
+ shr.u32 %r537, %r536, 31;
+ add.s32 %r538, %r536, %r537;
+ shr.s32 %r539, %r538, 1;
+ add.s32 %r540, %r4, %r539;
+ add.s32 %r541, %r540, -1;
+ shl.b32 %r542, %r9, 1;
+ shl.b32 %r543, %r4, 1;
+ mad.lo.s32 %r544, %r543, %r122, %r542;
+ or.b32 %r545, %r544, 1;
+ setp.ge.s32 %p184, %r545, %r201;
+ div.s32 %r546, %r541, %r4;
+ setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
- mul.lo.s32 %r548, %r4, %r123;
- shl.b32 %r549, %r548, 1;
- mad.lo.s32 %r550, %r202, %r5, %r549;
- add.s32 %r625, %r550, %r543;
- mul.lo.s32 %r148, %r202, %r3;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r547, %r4, %r122;
+ shl.b32 %r548, %r547, 1;
+ mad.lo.s32 %r549, %r201, %r5, %r548;
+ add.s32 %r619, %r549, %r542;
+ mul.lo.s32 %r147, %r201, %r3;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r624, %r5;
- mov.u32 %r626, %r536;
+ mov.u32 %r618, %r5;
+ mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
- setp.ge.s32 %p186, %r624, %r11;
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ setp.ge.s32 %p186, %r618, %r11;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
- mul.wide.s32 %rd225, %r625, 4;
+ mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
- ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
+ ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
$L__BB0_272:
- mov.b32 %f584, %r628;
+ mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r627;
+ mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
- add.s32 %r625, %r625, %r148;
- add.s32 %r624, %r624, %r3;
- add.s32 %r626, %r626, 1;
- setp.lt.s32 %p187, %r626, %r126;
+ add.s32 %r619, %r619, %r147;
+ add.s32 %r618, %r618, %r3;
+ add.s32 %r620, %r620, 1;
+ setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2187,29 +2181,29 @@
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
- mov.u32 %r629, %r630;
+ mov.u32 %r623, %r624;
$L__BB0_277:
- setp.ge.u32 %p190, %r5, %r629;
+ setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
- add.s32 %r557, %r629, %r50;
- mul.wide.s32 %rd226, %r557, 4;
+ add.s32 %r556, %r623, %r49;
+ mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
- shr.u32 %r160, %r629, 1;
- setp.gt.u32 %p191, %r629, 3;
- mov.u32 %r629, %r160;
+ shr.u32 %r159, %r623, 1;
+ setp.gt.u32 %p191, %r623, 3;
+ mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
@@ -2240,26 +2234,26 @@
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
- setp.ge.u32 %p196, %r5, %r630;
+ setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
- add.s32 %r558, %r630, %r50;
- mul.wide.s32 %rd229, %r558, 4;
+ add.s32 %r557, %r624, %r49;
+ mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
- shr.u32 %r162, %r630, 1;
- setp.gt.u32 %p197, %r630, 3;
- mov.u32 %r630, %r162;
+ shr.u32 %r161, %r624, 1;
+ setp.gt.u32 %p197, %r624, 3;
+ mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
@@ -2278,32 +2272,32 @@
{ cvt.rn.f16.f32 %rs132, %f687;}
@%p10 bra $L__BB0_296;
- add.s32 %r559, %r202, 1;
- shr.u32 %r560, %r559, 31;
- add.s32 %r561, %r559, %r560;
- shr.s32 %r562, %r561, 1;
- add.s32 %r563, %r4, %r562;
- add.s32 %r564, %r563, -1;
- div.s32 %r565, %r564, %r4;
- setp.ge.s32 %p201, %r123, %r565;
+ add.s32 %r558, %r201, 1;
+ shr.u32 %r559, %r558, 31;
+ add.s32 %r560, %r558, %r559;
+ shr.s32 %r561, %r560, 1;
+ add.s32 %r562, %r4, %r561;
+ add.s32 %r563, %r562, -1;
+ div.s32 %r564, %r563, %r4;
+ setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
- shl.b32 %r163, %r9, 1;
- mul.lo.s32 %r566, %r4, %r123;
- shl.b32 %r164, %r566, 1;
- add.s32 %r567, %r163, %r164;
- or.b32 %r568, %r567, 1;
- setp.ge.s32 %p202, %r568, %r202;
+ shl.b32 %r162, %r9, 1;
+ mul.lo.s32 %r565, %r4, %r122;
+ shl.b32 %r163, %r565, 1;
+ add.s32 %r566, %r162, %r163;
+ or.b32 %r567, %r566, 1;
+ setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r569, %r164, %r163;
+ add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
- mul.wide.s32 %rd233, %r569, 2;
+ mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
14: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_96
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<631>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
shl.b32 %r248, %r4, 2;
max.s32 %r249, %r2, %r3;
mad.lo.s32 %r250, %r248, %r249, 15;
and.b32 %r251, %r250, -16;
cvt.u64.u32 %rd2, %r251;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r252, %r8, 7;
setp.lt.s32 %p11, %r252, %r202;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
// end inline asm
shl.b32 %r256, %r5, 4;
add.s32 %r254, %r253, %r256;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r255, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r255, 0;
cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r583, %r6, 4;
add.s32 %r257, %r4, 215;
div.s32 %r258, %r257, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r259, %r11, %r258;
add.s32 %r260, %r259, -1;
div.s32 %r12, %r260, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r262, %ctaid.y;
mul.lo.s32 %r263, %r12, %r4;
mul.lo.s32 %r13, %r263, %r262;
shl.b32 %r264, %r9, 1;
mov.u32 %r265, 1;
shl.b32 %r266, %r5, 4;
mad.lo.s32 %r14, %r264, %r202, %r266;
mul.lo.s32 %r267, %r202, %r9;
cvt.s64.s32 %rd52, %r267;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r268, %r13, %r202;
cvt.s64.s32 %rd6, %r268;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r269, %tid.z;
mad.lo.s32 %r270, %r4, %r269, %r9;
mad.lo.s32 %r15, %r270, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r271, %r3;
mov.u32 %r272, 31;
sub.s32 %r273, %r272, %r271;
shl.b32 %r16, %r265, %r273;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r274, %r16, %r5;
setp.lt.u32 %p18, %r274, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r275, %r15, %r16;
mul.wide.s32 %rd55, %r275, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r276, %r16, 31;
add.s32 %r277, %r16, %r276;
shr.s32 %r17, %r277, 1;
add.s32 %r18, %r267, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r18, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r270, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r580, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r14, %r281;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r14, %r284;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r580, %r4;
add.s32 %r279, %r23, %r9;
add.s32 %r24, %r279, %r13;
setp.gt.s32 %p19, %r24, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r24, %r211;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r24, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r578, %r580, %r4;
mul.lo.s32 %r287, %r578, %r202;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r576, %r580, %r4;
add.s32 %r575, %r576, %r9;
add.s32 %r574, %r575, %r13;
setp.gt.s32 %p204, %r574, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r24, %r215;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r583, %r583, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r581, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r581;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r581, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r581, 1;
setp.gt.u32 %p27, %r581, 3;
mov.u32 %r581, %r36;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r582, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r582;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r582, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r582, 1;
setp.gt.u32 %p33, %r582, 3;
mov.u32 %r582, %r38;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
add.s32 %r351, %r13, %r577;
mad.lo.s32 %r352, %r351, %r202, %r18;
mul.wide.s32 %rd82, %r352, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r580, %r580, 1;
setp.lt.s32 %p39, %r580, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r353, %tid.z;
mad.lo.s32 %r354, %r4, %r353, %r9;
mad.lo.s32 %r50, %r354, %r3, %r5;
mul.wide.u32 %rd83, %r50, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r355, %r4;
mov.u32 %r356, 31;
sub.s32 %r51, %r356, %r355;
mov.u32 %r357, 1;
shl.b32 %r614, %r357, %r51;
setp.lt.u32 %p40, %r9, %r614;
add.s32 %r358, %r614, %r9;
setp.lt.u32 %p41, %r358, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r359, %r3, %r51;
add.s32 %r360, %r50, %r359;
mul.wide.s32 %rd85, %r360, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r614, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r584, %r614;
$L__BB0_48:
shr.u32 %r54, %r584, 1;
setp.ge.u32 %p44, %r9, %r54;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r361, %r54, %r3, %r50;
mul.wide.s32 %rd88, %r361, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r584, 7;
mov.u32 %r584, %r54;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r585, 0;
add.s32 %r363, %r50, %r3;
mul.wide.u32 %rd91, %r363, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r585, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r364, %r3, %r51;
add.s32 %r365, %r50, %r364;
mul.wide.s32 %rd93, %r365, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r586, %r614;
$L__BB0_59:
shr.u32 %r58, %r586, 1;
setp.ge.u32 %p50, %r9, %r58;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r366, %r58, %r3, %r50;
mul.wide.s32 %rd96, %r366, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r586, 7;
mov.u32 %r586, %r58;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r587, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r368, %r3, %r51;
add.s32 %r369, %r50, %r368;
mul.wide.s32 %rd99, %r369, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r588, %r614;
$L__BB0_70:
shr.u32 %r62, %r588, 1;
setp.ge.u32 %p56, %r9, %r62;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r370, %r62, %r3, %r50;
mul.wide.s32 %rd102, %r370, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r588, 7;
mov.u32 %r588, %r62;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r589, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r372, %r3, %r51;
add.s32 %r373, %r50, %r372;
mul.wide.s32 %rd105, %r373, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r590, %r614;
$L__BB0_81:
shr.u32 %r66, %r590, 1;
setp.ge.u32 %p62, %r9, %r66;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r374, %r66, %r3, %r50;
mul.wide.s32 %rd108, %r374, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r590, 7;
mov.u32 %r590, %r66;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r591, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r376, %r3, %r51;
add.s32 %r377, %r50, %r376;
mul.wide.s32 %rd111, %r377, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r592, %r614;
$L__BB0_92:
shr.u32 %r70, %r592, 1;
setp.ge.u32 %p68, %r9, %r70;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r378, %r70, %r3, %r50;
mul.wide.s32 %rd114, %r378, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r592, 7;
mov.u32 %r592, %r70;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r593, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r380, %r3, %r51;
add.s32 %r381, %r50, %r380;
mul.wide.s32 %rd117, %r381, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r594, %r614;
$L__BB0_103:
shr.u32 %r74, %r594, 1;
setp.ge.u32 %p74, %r9, %r74;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r382, %r74, %r3, %r50;
mul.wide.s32 %rd120, %r382, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r594, 7;
mov.u32 %r594, %r74;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r595, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r384, %r3, %r51;
add.s32 %r385, %r50, %r384;
mul.wide.s32 %rd123, %r385, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r596, %r614;
$L__BB0_114:
shr.u32 %r78, %r596, 1;
setp.ge.u32 %p80, %r9, %r78;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r386, %r78, %r3, %r50;
mul.wide.s32 %rd126, %r386, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r596, 7;
mov.u32 %r596, %r78;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r597, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r388, %r3, %r51;
add.s32 %r389, %r50, %r388;
mul.wide.s32 %rd129, %r389, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r598, %r614;
$L__BB0_125:
shr.u32 %r82, %r598, 1;
setp.ge.u32 %p86, %r9, %r82;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r390, %r82, %r3, %r50;
mul.wide.s32 %rd132, %r390, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r598, 7;
mov.u32 %r598, %r82;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r599, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r583, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r392, %r3, %r51;
add.s32 %r393, %r50, %r392;
mul.wide.s32 %rd135, %r393, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r600, %r614;
$L__BB0_136:
shr.u32 %r87, %r600, 1;
setp.ge.u32 %p92, %r9, %r87;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r394, %r87, %r3, %r50;
mul.wide.s32 %rd138, %r394, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r600, 7;
mov.u32 %r600, %r87;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r601, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r396, %r3, %r51;
add.s32 %r397, %r50, %r396;
mul.wide.s32 %rd141, %r397, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r602, %r614;
$L__BB0_147:
shr.u32 %r91, %r602, 1;
setp.ge.u32 %p98, %r9, %r91;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r398, %r91, %r3, %r50;
mul.wide.s32 %rd144, %r398, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r602, 7;
mov.u32 %r602, %r91;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r603, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r400, %r3, %r51;
add.s32 %r401, %r50, %r400;
mul.wide.s32 %rd147, %r401, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r604, %r614;
$L__BB0_158:
shr.u32 %r95, %r604, 1;
setp.ge.u32 %p104, %r9, %r95;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r402, %r95, %r3, %r50;
mul.wide.s32 %rd150, %r402, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r604, 7;
mov.u32 %r604, %r95;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r605, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r404, %r3, %r51;
add.s32 %r405, %r50, %r404;
mul.wide.s32 %rd153, %r405, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r606, %r614;
$L__BB0_169:
shr.u32 %r99, %r606, 1;
setp.ge.u32 %p110, %r9, %r99;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r406, %r99, %r3, %r50;
mul.wide.s32 %rd156, %r406, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r606, 7;
mov.u32 %r606, %r99;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r607, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r408, %r3, %r51;
add.s32 %r409, %r50, %r408;
mul.wide.s32 %rd159, %r409, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r608, %r614;
$L__BB0_180:
shr.u32 %r103, %r608, 1;
setp.ge.u32 %p116, %r9, %r103;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r410, %r103, %r3, %r50;
mul.wide.s32 %rd162, %r410, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r608, 7;
mov.u32 %r608, %r103;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r609, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r412, %r3, %r51;
add.s32 %r413, %r50, %r412;
mul.wide.s32 %rd165, %r413, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r610, %r614;
$L__BB0_191:
shr.u32 %r107, %r610, 1;
setp.ge.u32 %p122, %r9, %r107;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r414, %r107, %r3, %r50;
mul.wide.s32 %rd168, %r414, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r610, 7;
mov.u32 %r610, %r107;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r611, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r611, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r416, %r3, %r51;
add.s32 %r417, %r50, %r416;
mul.wide.s32 %rd171, %r417, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r612, %r614;
$L__BB0_202:
shr.u32 %r111, %r612, 1;
setp.ge.u32 %p128, %r9, %r111;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r418, %r111, %r3, %r50;
mul.wide.s32 %rd174, %r418, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r612, 7;
mov.u32 %r612, %r111;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r613, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r613, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r420, %r3, %r51;
add.s32 %r421, %r50, %r420;
mul.wide.s32 %rd177, %r421, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r614, 1;
setp.ge.u32 %p134, %r9, %r115;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r422, %r115, %r3, %r50;
mul.wide.s32 %rd180, %r422, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r614, 7;
mov.u32 %r614, %r115;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r615, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r615, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
shl.b32 %r573, %r5, 3;
mov.u32 %r448, %ctaid.y;
mad.lo.s32 %r449, %r202, %r448, %r573;
add.s32 %r450, %r449, %r85;
mul.wide.s32 %rd189, %r450, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
// end inline asm
add.s32 %r451, %r450, 4;
mul.wide.s32 %rd190, %r451, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r424, %r570, 3;
sub.s32 %r118, %r424, %r202;
mov.u32 %r425, %ctaid.y;
mad.lo.s32 %r119, %r202, %r425, %r570;
neg.s32 %r426, %r85;
setp.ge.s32 %p141, %r118, %r426;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r431, %r119, %r85;
mul.wide.s32 %rd184, %r431, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
// end inline asm
$L__BB0_222:
mov.u32 %r432, -4;
sub.s32 %r433, %r432, %r85;
setp.ge.s32 %p143, %r118, %r433;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r438, %r119, %r85;
add.s32 %r439, %r438, 4;
mul.wide.s32 %rd186, %r439, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
// end inline asm
$L__BB0_226:
shl.b32 %r120, %r583, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
shl.b32 %r572, %r5, 3;
mov.u32 %r476, %ctaid.y;
mad.lo.s32 %r477, %r202, %r476, %r572;
add.s32 %r478, %r477, %r120;
mul.wide.s32 %rd197, %r478, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
// end inline asm
add.s32 %r479, %r478, 4;
mul.wide.s32 %rd198, %r479, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r452, %r571, 3;
sub.s32 %r121, %r452, %r202;
mov.u32 %r453, %ctaid.y;
mad.lo.s32 %r122, %r202, %r453, %r571;
neg.s32 %r454, %r120;
setp.ge.s32 %p150, %r121, %r454;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r459, %r122, %r120;
mul.wide.s32 %rd192, %r459, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
// end inline asm
$L__BB0_229:
mov.u32 %r460, -4;
sub.s32 %r461, %r460, %r120;
setp.ge.s32 %p152, %r121, %r461;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r466, %r122, %r120;
add.s32 %r467, %r466, 4;
mul.wide.s32 %rd194, %r467, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
// end inline asm
$L__BB0_233:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r480, %r5, %r9;
or.b32 %r482, %r480, %r353;
setp.ne.s32 %p156, %r482, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r483, %ctaid.x;
mov.u32 %r484, %ctaid.z;
mov.u32 %r485, %nctaid.x;
mad.lo.s32 %r486, %r484, %r485, %r483;
mul.wide.s32 %rd200, %r486, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r487, %r11, -1;
setp.eq.s32 %p157, %r123, %r487;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r616, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r616;
// end inline asm
setp.lt.u32 %p159, %r616, 256;
selp.u32 %r490, 1, 0, %p159;
shl.b32 %r616, %r616, %r490;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r491, %r11, %r3;
add.s32 %r492, %r491, -1;
div.s32 %r126, %r492, %r3;
setp.lt.s32 %p161, %r126, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r494, %r202, 1;
shr.u32 %r495, %r494, 31;
add.s32 %r496, %r494, %r495;
shr.s32 %r497, %r496, 1;
add.s32 %r498, %r4, %r497;
add.s32 %r499, %r498, -1;
shl.b32 %r500, %r9, 1;
shl.b32 %r501, %r4, 1;
mad.lo.s32 %r502, %r501, %r123, %r500;
or.b32 %r503, %r502, 1;
setp.ge.s32 %p162, %r503, %r202;
div.s32 %r504, %r499, %r4;
setp.ge.s32 %p163, %r123, %r504;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r505, %r4, %r123;
shl.b32 %r506, %r505, 1;
mad.lo.s32 %r507, %r202, %r5, %r506;
add.s32 %r618, %r507, %r500;
mul.lo.s32 %r128, %r202, %r3;
mov.u32 %r493, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r617, %r5;
mov.u32 %r619, %r493;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r617, %r11;
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r618, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r621;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r620;
add.f32 %f678, %f678, %f559;
add.s32 %r618, %r618, %r128;
add.s32 %r617, %r617, %r3;
add.s32 %r619, %r619, 1;
setp.lt.s32 %p165, %r619, %r126;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
mov.u32 %r517, 1;
shl.b32 %r139, %r517, %r516;
setp.lt.u32 %p166, %r5, %r139;
add.s32 %r518, %r139, %r5;
setp.lt.u32 %p167, %r518, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r519, %r50, %r139;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r520, %r139, 31;
add.s32 %r521, %r139, %r520;
shr.s32 %r630, %r521, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r139, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r622, %r630;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r622;
@%p170 bra $L__BB0_249;
add.s32 %r522, %r622, %r50;
mul.wide.s32 %rd213, %r522, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r142, %r622, 1;
setp.gt.u32 %p171, %r622, 3;
mov.u32 %r622, %r142;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r523, %r50, 1;
mul.wide.u32 %rd216, %r523, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r623, %r630;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r623;
@%p176 bra $L__BB0_259;
add.s32 %r524, %r623, %r50;
mul.wide.s32 %rd218, %r524, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r144, %r623, 1;
setp.gt.u32 %p177, %r623, 3;
mov.u32 %r623, %r144;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r525, %r202, 1;
shr.u32 %r526, %r525, 31;
add.s32 %r527, %r525, %r526;
shr.s32 %r528, %r527, 1;
add.s32 %r529, %r4, %r528;
add.s32 %r530, %r529, -1;
div.s32 %r531, %r530, %r4;
setp.ge.s32 %p181, %r123, %r531;
@%p181 bra $L__BB0_267;
shl.b32 %r145, %r9, 1;
mul.lo.s32 %r532, %r4, %r123;
shl.b32 %r146, %r532, 1;
add.s32 %r533, %r145, %r146;
or.b32 %r534, %r533, 1;
setp.ge.s32 %p182, %r534, %r202;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r535, %r146, %r145;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r537, %r202, 1;
shr.u32 %r538, %r537, 31;
add.s32 %r539, %r537, %r538;
shr.s32 %r540, %r539, 1;
add.s32 %r541, %r4, %r540;
add.s32 %r542, %r541, -1;
shl.b32 %r543, %r9, 1;
shl.b32 %r544, %r4, 1;
mad.lo.s32 %r545, %r544, %r123, %r543;
or.b32 %r546, %r545, 1;
setp.ge.s32 %p184, %r546, %r202;
div.s32 %r547, %r542, %r4;
setp.ge.s32 %p185, %r123, %r547;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r548, %r4, %r123;
shl.b32 %r549, %r548, 1;
mad.lo.s32 %r550, %r202, %r5, %r549;
add.s32 %r625, %r550, %r543;
mul.lo.s32 %r148, %r202, %r3;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r624, %r5;
mov.u32 %r626, %r536;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r624, %r11;
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r625, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r628;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r627;
add.f32 %f684, %f684, %f585;
add.s32 %r625, %r625, %r148;
add.s32 %r624, %r624, %r3;
add.s32 %r626, %r626, 1;
setp.lt.s32 %p187, %r626, %r126;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r629, %r630;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r629;
@%p190 bra $L__BB0_279;
add.s32 %r557, %r629, %r50;
mul.wide.s32 %rd226, %r557, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r160, %r629, 1;
setp.gt.u32 %p191, %r629, 3;
mov.u32 %r629, %r160;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r630;
@%p196 bra $L__BB0_288;
add.s32 %r558, %r630, %r50;
mul.wide.s32 %rd229, %r558, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r162, %r630, 1;
setp.gt.u32 %p197, %r630, 3;
mov.u32 %r630, %r162;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r559, %r202, 1;
shr.u32 %r560, %r559, 31;
add.s32 %r561, %r559, %r560;
shr.s32 %r562, %r561, 1;
add.s32 %r563, %r4, %r562;
add.s32 %r564, %r563, -1;
div.s32 %r565, %r564, %r4;
setp.ge.s32 %p201, %r123, %r565;
@%p201 bra $L__BB0_296;
shl.b32 %r163, %r9, 1;
mul.lo.s32 %r566, %r4, %r123;
shl.b32 %r164, %r566, 1;
add.s32 %r567, %r163, %r164;
or.b32 %r568, %r567, 1;
setp.ge.s32 %p202, %r568, %r202;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_ee68fbc2_1033910nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r569, %r164, %r163;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r569, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r200, %r201}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r210, %r211}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r214, %r215}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r236, %r201, 7;
shr.s32 %r237, %r236, 31;
shr.u32 %r238, %r237, 29;
add.s32 %r239, %r236, %r238;
shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r240, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r242, %r4, %r2;
shl.b32 %r243, %r242, 4;
or.b32 %r244, %r243, 15;
and.b32 %r7, %r244, -16;
add.s32 %r245, %r244, %r7;
and.b32 %r246, %r245, -16;
cvt.s64.s32 %rd1, %r246;
shl.b32 %r247, %r4, 2;
max.s32 %r248, %r2, %r3;
mad.lo.s32 %r249, %r247, %r248, 15;
and.b32 %r250, %r249, -16;
cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r251, %r8, 7;
setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
// end inline asm
shl.b32 %r255, %r5, 4;
add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r254, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r254, 0;
cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r577, %r6, 4;
add.s32 %r256, %r4, 215;
div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r258, %r11, %r257;
add.s32 %r259, %r258, -1;
div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r261, %ctaid.y;
mul.lo.s32 %r262, %r12, %r4;
mul.lo.s32 %r13, %r262, %r261;
mad.lo.s32 %r263, %r2, %r9, %r5;
shl.b32 %r14, %r263, 4;
mul.lo.s32 %r264, %r201, %r9;
cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r265, %r13, %r201;
cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r266, %tid.z;
mad.lo.s32 %r267, %r4, %r266, %r9;
mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r268, %r3;
mov.u32 %r269, 31;
sub.s32 %r270, %r269, %r268;
mov.u32 %r271, 1;
shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r272, %r16, %r5;
setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r273, %r15, %r16;
mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r274, %r16, 31;
add.s32 %r275, %r16, %r274;
shr.s32 %r17, %r275, 1;
shl.b32 %r276, %r9, 3;
mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r281, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r574, %r4;
add.s32 %r279, %r22, %r9;
add.s32 %r23, %r279, %r13;
setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r572, %r574, %r4;
mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r571, %r574, %r4;
add.s32 %r570, %r571, %r9;
add.s32 %r569, %r570, %r13;
setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r575, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r575, 1;
setp.gt.u32 %p27, %r575, 3;
mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r576, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r576, 1;
setp.gt.u32 %p33, %r576, 3;
mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
mad.lo.s32 %r351, %r23, %r201, %r8;
mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r574, %r574, 1;
setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r352, %tid.z;
mad.lo.s32 %r353, %r4, %r352, %r9;
mad.lo.s32 %r49, %r353, %r3, %r5;
mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r354, %r4;
mov.u32 %r355, 31;
sub.s32 %r50, %r355, %r354;
mov.u32 %r356, 1;
shl.b32 %r608, %r356, %r50;
setp.lt.u32 %p40, %r9, %r608;
add.s32 %r357, %r608, %r9;
setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r358, %r3, %r50;
add.s32 %r359, %r49, %r358;
mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r578, %r608;
$L__BB0_48:
shr.u32 %r53, %r578, 1;
setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r360, %r53, %r3, %r49;
mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r578, 7;
mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r579, 0;
add.s32 %r362, %r49, %r3;
mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r363, %r3, %r50;
add.s32 %r364, %r49, %r363;
mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r580, %r608;
$L__BB0_59:
shr.u32 %r57, %r580, 1;
setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r365, %r57, %r3, %r49;
mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r580, 7;
mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r367, %r3, %r50;
add.s32 %r368, %r49, %r367;
mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r582, %r608;
$L__BB0_70:
shr.u32 %r61, %r582, 1;
setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r369, %r61, %r3, %r49;
mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r582, 7;
mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r371, %r3, %r50;
add.s32 %r372, %r49, %r371;
mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r584, %r608;
$L__BB0_81:
shr.u32 %r65, %r584, 1;
setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r373, %r65, %r3, %r49;
mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r584, 7;
mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r375, %r3, %r50;
add.s32 %r376, %r49, %r375;
mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r586, %r608;
$L__BB0_92:
shr.u32 %r69, %r586, 1;
setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r377, %r69, %r3, %r49;
mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r586, 7;
mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r379, %r3, %r50;
add.s32 %r380, %r49, %r379;
mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r588, %r608;
$L__BB0_103:
shr.u32 %r73, %r588, 1;
setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r381, %r73, %r3, %r49;
mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r588, 7;
mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r383, %r3, %r50;
add.s32 %r384, %r49, %r383;
mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r590, %r608;
$L__BB0_114:
shr.u32 %r77, %r590, 1;
setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r385, %r77, %r3, %r49;
mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r590, 7;
mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r387, %r3, %r50;
add.s32 %r388, %r49, %r387;
mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r592, %r608;
$L__BB0_125:
shr.u32 %r81, %r592, 1;
setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r389, %r81, %r3, %r49;
mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r592, 7;
mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r391, %r3, %r50;
add.s32 %r392, %r49, %r391;
mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r594, %r608;
$L__BB0_136:
shr.u32 %r86, %r594, 1;
setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r393, %r86, %r3, %r49;
mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r594, 7;
mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r395, %r3, %r50;
add.s32 %r396, %r49, %r395;
mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r596, %r608;
$L__BB0_147:
shr.u32 %r90, %r596, 1;
setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r397, %r90, %r3, %r49;
mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r596, 7;
mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r399, %r3, %r50;
add.s32 %r400, %r49, %r399;
mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r598, %r608;
$L__BB0_158:
shr.u32 %r94, %r598, 1;
setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r401, %r94, %r3, %r49;
mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r598, 7;
mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r403, %r3, %r50;
add.s32 %r404, %r49, %r403;
mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r600, %r608;
$L__BB0_169:
shr.u32 %r98, %r600, 1;
setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r405, %r98, %r3, %r49;
mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r600, 7;
mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r407, %r3, %r50;
add.s32 %r408, %r49, %r407;
mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r602, %r608;
$L__BB0_180:
shr.u32 %r102, %r602, 1;
setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r409, %r102, %r3, %r49;
mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r602, 7;
mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r411, %r3, %r50;
add.s32 %r412, %r49, %r411;
mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r604, %r608;
$L__BB0_191:
shr.u32 %r106, %r604, 1;
setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r413, %r106, %r3, %r49;
mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r604, 7;
mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r415, %r3, %r50;
add.s32 %r416, %r49, %r415;
mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r606, %r608;
$L__BB0_202:
shr.u32 %r110, %r606, 1;
setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r417, %r110, %r3, %r49;
mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r606, 7;
mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r419, %r3, %r50;
add.s32 %r420, %r49, %r419;
mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r608, 1;
setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r421, %r114, %r3, %r49;
mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r608, 7;
mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
mov.u32 %r447, %ctaid.y;
mad.lo.s32 %r448, %r201, %r447, %r8;
add.s32 %r449, %r448, %r84;
mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
// end inline asm
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r423, %r8, 3;
sub.s32 %r117, %r423, %r201;
mov.u32 %r424, %ctaid.y;
mad.lo.s32 %r118, %r201, %r424, %r8;
neg.s32 %r425, %r84;
setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r430, %r118, %r84;
mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_222:
mov.u32 %r431, -4;
sub.s32 %r432, %r431, %r84;
setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r437, %r118, %r84;
add.s32 %r438, %r437, 4;
mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
// end inline asm
$L__BB0_226:
shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
mov.u32 %r475, %ctaid.y;
mad.lo.s32 %r476, %r201, %r475, %r8;
add.s32 %r477, %r476, %r119;
mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
// end inline asm
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r451, %r8, 3;
sub.s32 %r120, %r451, %r201;
mov.u32 %r452, %ctaid.y;
mad.lo.s32 %r121, %r201, %r452, %r8;
neg.s32 %r453, %r119;
setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r458, %r121, %r119;
mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
// end inline asm
$L__BB0_229:
mov.u32 %r459, -4;
sub.s32 %r460, %r459, %r119;
setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r465, %r121, %r119;
add.s32 %r466, %r465, 4;
mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
// end inline asm
$L__BB0_233:
mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r479, %r5, %r9;
or.b32 %r481, %r479, %r352;
setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r482, %ctaid.x;
mov.u32 %r483, %ctaid.z;
mov.u32 %r484, %nctaid.x;
mad.lo.s32 %r485, %r483, %r484, %r482;
mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r486, %r11, -1;
setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r610, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r610;
// end inline asm
setp.lt.u32 %p159, %r610, 256;
selp.u32 %r489, 1, 0, %p159;
shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r490, %r11, %r3;
add.s32 %r491, %r490, -1;
div.s32 %r125, %r491, %r3;
setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r493, %r201, 1;
shr.u32 %r494, %r493, 31;
add.s32 %r495, %r493, %r494;
shr.s32 %r496, %r495, 1;
add.s32 %r497, %r4, %r496;
add.s32 %r498, %r497, -1;
shl.b32 %r499, %r9, 1;
shl.b32 %r500, %r4, 1;
mad.lo.s32 %r501, %r500, %r122, %r499;
or.b32 %r502, %r501, 1;
setp.ge.s32 %p162, %r502, %r201;
div.s32 %r503, %r498, %r4;
setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r504, %r4, %r122;
shl.b32 %r505, %r504, 1;
mad.lo.s32 %r506, %r201, %r5, %r505;
add.s32 %r612, %r506, %r499;
mul.lo.s32 %r127, %r201, %r3;
mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r611, %r5;
mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r611, %r11;
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
add.s32 %r612, %r612, %r127;
add.s32 %r611, %r611, %r3;
add.s32 %r613, %r613, 1;
setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r513, %r3;
mov.u32 %r514, 31;
sub.s32 %r515, %r514, %r513;
mov.u32 %r516, 1;
shl.b32 %r138, %r516, %r515;
setp.lt.u32 %p166, %r5, %r138;
add.s32 %r517, %r138, %r5;
setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r518, %r49, %r138;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r519, %r138, 31;
add.s32 %r520, %r138, %r519;
shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r616, %r624;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
add.s32 %r521, %r616, %r49;
mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r141, %r616, 1;
setp.gt.u32 %p171, %r616, 3;
mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r522, %r49, 1;
mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r617, %r624;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
add.s32 %r523, %r617, %r49;
mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r143, %r617, 1;
setp.gt.u32 %p177, %r617, 3;
mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r524, %r201, 1;
shr.u32 %r525, %r524, 31;
add.s32 %r526, %r524, %r525;
shr.s32 %r527, %r526, 1;
add.s32 %r528, %r4, %r527;
add.s32 %r529, %r528, -1;
div.s32 %r530, %r529, %r4;
setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
shl.b32 %r144, %r9, 1;
mul.lo.s32 %r531, %r4, %r122;
shl.b32 %r145, %r531, 1;
add.s32 %r532, %r144, %r145;
or.b32 %r533, %r532, 1;
setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r536, %r201, 1;
shr.u32 %r537, %r536, 31;
add.s32 %r538, %r536, %r537;
shr.s32 %r539, %r538, 1;
add.s32 %r540, %r4, %r539;
add.s32 %r541, %r540, -1;
shl.b32 %r542, %r9, 1;
shl.b32 %r543, %r4, 1;
mad.lo.s32 %r544, %r543, %r122, %r542;
or.b32 %r545, %r544, 1;
setp.ge.s32 %p184, %r545, %r201;
div.s32 %r546, %r541, %r4;
setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r547, %r4, %r122;
shl.b32 %r548, %r547, 1;
mad.lo.s32 %r549, %r201, %r5, %r548;
add.s32 %r619, %r549, %r542;
mul.lo.s32 %r147, %r201, %r3;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r618, %r5;
mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r618, %r11;
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
add.s32 %r619, %r619, %r147;
add.s32 %r618, %r618, %r3;
add.s32 %r620, %r620, 1;
setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r623, %r624;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
add.s32 %r556, %r623, %r49;
mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r159, %r623, 1;
setp.gt.u32 %p191, %r623, 3;
mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
add.s32 %r557, %r624, %r49;
mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r161, %r624, 1;
setp.gt.u32 %p197, %r624, 3;
mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r558, %r201, 1;
shr.u32 %r559, %r558, 31;
add.s32 %r560, %r558, %r559;
shr.s32 %r561, %r560, 1;
add.s32 %r562, %r4, %r561;
add.s32 %r563, %r562, -1;
div.s32 %r564, %r563, %r4;
setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
shl.b32 %r162, %r9, 1;
mul.lo.s32 %r565, %r4, %r122;
shl.b32 %r163, %r565, 1;
add.s32 %r566, %r162, %r163;
or.b32 %r567, %r566, 1;
setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_29_cu_0c844885_723310nvfuser_29ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<631>;
+ .reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r200, %r201}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r210, %r211}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r214, %r215}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r237, %r202, 7;
- shr.s32 %r238, %r237, 31;
- shr.u32 %r239, %r238, 29;
- add.s32 %r240, %r237, %r239;
- shr.s32 %r2, %r240, 3;
+ add.s32 %r236, %r201, 7;
+ shr.s32 %r237, %r236, 31;
+ shr.u32 %r238, %r237, 29;
+ add.s32 %r239, %r236, %r238;
+ shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
- mov.u32 %r241, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
+ mov.u32 %r240, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r242, [%rd42], %r5;
+ atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r243, %r4, %r2;
- shl.b32 %r244, %r243, 4;
- or.b32 %r245, %r244, 15;
- and.b32 %r7, %r245, -16;
- add.s32 %r246, %r245, %r7;
- and.b32 %r247, %r246, -16;
- cvt.s64.s32 %rd1, %r247;
- shl.b32 %r248, %r4, 2;
- max.s32 %r249, %r2, %r3;
- mad.lo.s32 %r250, %r248, %r249, 15;
- and.b32 %r251, %r250, -16;
- cvt.u64.u32 %rd2, %r251;
+ mul.lo.s32 %r242, %r4, %r2;
+ shl.b32 %r243, %r242, 4;
+ or.b32 %r244, %r243, 15;
+ and.b32 %r7, %r244, -16;
+ add.s32 %r245, %r244, %r7;
+ and.b32 %r246, %r245, -16;
+ cvt.s64.s32 %rd1, %r246;
+ shl.b32 %r247, %r4, 2;
+ max.s32 %r248, %r2, %r3;
+ mad.lo.s32 %r249, %r247, %r248, 15;
+ and.b32 %r250, %r249, -16;
+ cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r252, %r8, 7;
- setp.lt.s32 %p11, %r252, %r202;
+ or.b32 %r251, %r8, 7;
+ setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
-
-
- shl.b32 %r256, %r5, 4;
- add.s32 %r254, %r253, %r256;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
+
+
+ shl.b32 %r255, %r5, 4;
+ add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
- mov.u32 %r255, 0;
+ mov.u32 %r254, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r255, 0;
- cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r254, 0;
+ cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r583, %r6, 4;
- add.s32 %r257, %r4, 215;
- div.s32 %r258, %r257, %r4;
+ shl.b32 %r577, %r6, 4;
+ add.s32 %r256, %r4, 215;
+ div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r259, %r11, %r258;
- add.s32 %r260, %r259, -1;
- div.s32 %r12, %r260, %r11;
+ add.s32 %r258, %r11, %r257;
+ add.s32 %r259, %r258, -1;
+ div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r202;
+ cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
- mov.u32 %r262, %ctaid.y;
- mul.lo.s32 %r263, %r12, %r4;
- mul.lo.s32 %r13, %r263, %r262;
- shl.b32 %r264, %r9, 1;
- mov.u32 %r265, 1;
- shl.b32 %r266, %r5, 4;
- mad.lo.s32 %r14, %r264, %r202, %r266;
- mul.lo.s32 %r267, %r202, %r9;
- cvt.s64.s32 %rd52, %r267;
+ mov.u32 %r261, %ctaid.y;
+ mul.lo.s32 %r262, %r12, %r4;
+ mul.lo.s32 %r13, %r262, %r261;
+ mad.lo.s32 %r263, %r2, %r9, %r5;
+ shl.b32 %r14, %r263, 4;
+ mul.lo.s32 %r264, %r201, %r9;
+ cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r268, %r13, %r202;
- cvt.s64.s32 %rd6, %r268;
+ mul.lo.s32 %r265, %r13, %r201;
+ cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r269, %tid.z;
- mad.lo.s32 %r270, %r4, %r269, %r9;
- mad.lo.s32 %r15, %r270, %r3, %r5;
+ mov.u32 %r266, %tid.z;
+ mad.lo.s32 %r267, %r4, %r266, %r9;
+ mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
- clz.b32 %r271, %r3;
- mov.u32 %r272, 31;
- sub.s32 %r273, %r272, %r271;
- shl.b32 %r16, %r265, %r273;
+ clz.b32 %r268, %r3;
+ mov.u32 %r269, 31;
+ sub.s32 %r270, %r269, %r268;
+ mov.u32 %r271, 1;
+ shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
- add.s32 %r274, %r16, %r5;
- setp.lt.u32 %p18, %r274, %r3;
+ add.s32 %r272, %r16, %r5;
+ setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
- add.s32 %r275, %r15, %r16;
- mul.wide.s32 %rd55, %r275, 4;
+ add.s32 %r273, %r15, %r16;
+ mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
- shr.u32 %r276, %r16, 31;
- add.s32 %r277, %r16, %r276;
- shr.s32 %r17, %r277, 1;
- add.s32 %r18, %r267, %r8;
+ shr.u32 %r274, %r16, 31;
+ add.s32 %r275, %r16, %r274;
+ shr.s32 %r17, %r275, 1;
+ shl.b32 %r276, %r9, 3;
+ mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
- mul.wide.s32 %rd57, %r18, 2;
+ mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
- mul.wide.s32 %rd61, %r270, 4;
+ mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
- add.s32 %r282, %r14, %r281;
+ add.s32 %r282, %r281, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
- add.s32 %r285, %r14, %r284;
+ add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r580, %r4;
- add.s32 %r279, %r23, %r9;
- add.s32 %r24, %r279, %r13;
- setp.gt.s32 %p19, %r24, 215;
+ mul.lo.s32 %r22, %r574, %r4;
+ add.s32 %r279, %r22, %r9;
+ add.s32 %r23, %r279, %r13;
+ setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
- mul.lo.s32 %r280, %r24, %r211;
+ mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p20, %r24, 216;
+ setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
- mul.lo.s32 %r578, %r580, %r4;
- mul.lo.s32 %r287, %r578, %r202;
+ mul.lo.s32 %r572, %r574, %r4;
+ mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r576, %r580, %r4;
- add.s32 %r575, %r576, %r9;
- add.s32 %r574, %r575, %r13;
- setp.gt.s32 %p204, %r574, 215;
+ mul.lo.s32 %r571, %r574, %r4;
+ add.s32 %r570, %r571, %r9;
+ add.s32 %r569, %r570, %r13;
+ setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
- mul.lo.s32 %r288, %r24, %r215;
+ mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r583, %r583, 2;
+ shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
- mov.u32 %r581, %r17;
+ mov.u32 %r575, %r17;
$L__BB0_20:
- setp.ge.u32 %p26, %r5, %r581;
+ setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
- add.s32 %r317, %r581, %r15;
+ add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r581, 1;
- setp.gt.u32 %p27, %r581, 3;
- mov.u32 %r581, %r36;
+ shr.u32 %r35, %r575, 1;
+ setp.gt.u32 %p27, %r575, 3;
+ mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
- mov.u32 %r582, %r17;
+ mov.u32 %r576, %r17;
$L__BB0_30:
- setp.ge.u32 %p32, %r5, %r582;
+ setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
- add.s32 %r318, %r582, %r15;
+ add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r582, 1;
- setp.gt.u32 %p33, %r582, 3;
- mov.u32 %r582, %r38;
+ shr.u32 %r37, %r576, 1;
+ setp.gt.u32 %p33, %r576, 3;
+ mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
@@ -801,13 +800,12 @@
{ cvt.rn.f16.f32 %rs124, %f365;}
mov.b32 %r326, {%rs124, %rs128};
- add.s32 %r351, %r13, %r577;
- mad.lo.s32 %r352, %r351, %r202, %r18;
- mul.wide.s32 %rd82, %r352, 2;
+ mad.lo.s32 %r351, %r23, %r201, %r8;
+ mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.f16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r580, %r580, 1;
- setp.lt.s32 %p39, %r580, %r12;
+ add.s32 %r574, %r574, 1;
+ setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r353, %tid.z;
- mad.lo.s32 %r354, %r4, %r353, %r9;
- mad.lo.s32 %r50, %r354, %r3, %r5;
- mul.wide.u32 %rd83, %r50, 4;
+ mov.u32 %r352, %tid.z;
+ mad.lo.s32 %r353, %r4, %r352, %r9;
+ mad.lo.s32 %r49, %r353, %r3, %r5;
+ mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r355, %r4;
- mov.u32 %r356, 31;
- sub.s32 %r51, %r356, %r355;
- mov.u32 %r357, 1;
- shl.b32 %r614, %r357, %r51;
- setp.lt.u32 %p40, %r9, %r614;
- add.s32 %r358, %r614, %r9;
- setp.lt.u32 %p41, %r358, %r4;
+ clz.b32 %r354, %r4;
+ mov.u32 %r355, 31;
+ sub.s32 %r50, %r355, %r354;
+ mov.u32 %r356, 1;
+ shl.b32 %r608, %r356, %r50;
+ setp.lt.u32 %p40, %r9, %r608;
+ add.s32 %r357, %r608, %r9;
+ setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
- shl.b32 %r359, %r3, %r51;
- add.s32 %r360, %r50, %r359;
- mul.wide.s32 %rd85, %r360, 4;
+ shl.b32 %r358, %r3, %r50;
+ add.s32 %r359, %r49, %r358;
+ mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p43, %r614, 4;
+ setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
- mov.u32 %r584, %r614;
+ mov.u32 %r578, %r608;
$L__BB0_48:
- shr.u32 %r54, %r584, 1;
- setp.ge.u32 %p44, %r9, %r54;
+ shr.u32 %r53, %r578, 1;
+ setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
- mad.lo.s32 %r361, %r54, %r3, %r50;
- mul.wide.s32 %rd88, %r361, 4;
+ mad.lo.s32 %r360, %r53, %r3, %r49;
+ mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p45, %r584, 7;
- mov.u32 %r584, %r54;
+ setp.gt.u32 %p45, %r578, 7;
+ mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r585, 0;
- add.s32 %r363, %r50, %r3;
- mul.wide.u32 %rd91, %r363, 4;
+ mov.u32 %r579, 0;
+ add.s32 %r362, %r49, %r3;
+ mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r585, %f660;
+ mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
- shl.b32 %r364, %r3, %r51;
- add.s32 %r365, %r50, %r364;
- mul.wide.s32 %rd93, %r365, 4;
+ shl.b32 %r363, %r3, %r50;
+ add.s32 %r364, %r49, %r363;
+ mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
- mov.u32 %r586, %r614;
+ mov.u32 %r580, %r608;
$L__BB0_59:
- shr.u32 %r58, %r586, 1;
- setp.ge.u32 %p50, %r9, %r58;
+ shr.u32 %r57, %r580, 1;
+ setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
- mad.lo.s32 %r366, %r58, %r3, %r50;
- mul.wide.s32 %rd96, %r366, 4;
+ mad.lo.s32 %r365, %r57, %r3, %r49;
+ mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p51, %r586, 7;
- mov.u32 %r586, %r58;
+ setp.gt.u32 %p51, %r580, 7;
+ mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r587, 0;
+ mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r587, %f661;
+ mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
- shl.b32 %r368, %r3, %r51;
- add.s32 %r369, %r50, %r368;
- mul.wide.s32 %rd99, %r369, 4;
+ shl.b32 %r367, %r3, %r50;
+ add.s32 %r368, %r49, %r367;
+ mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
- mov.u32 %r588, %r614;
+ mov.u32 %r582, %r608;
$L__BB0_70:
- shr.u32 %r62, %r588, 1;
- setp.ge.u32 %p56, %r9, %r62;
+ shr.u32 %r61, %r582, 1;
+ setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
- mad.lo.s32 %r370, %r62, %r3, %r50;
- mul.wide.s32 %rd102, %r370, 4;
+ mad.lo.s32 %r369, %r61, %r3, %r49;
+ mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p57, %r588, 7;
- mov.u32 %r588, %r62;
+ setp.gt.u32 %p57, %r582, 7;
+ mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r589, 0;
+ mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r589, %f662;
+ mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
- shl.b32 %r372, %r3, %r51;
- add.s32 %r373, %r50, %r372;
- mul.wide.s32 %rd105, %r373, 4;
+ shl.b32 %r371, %r3, %r50;
+ add.s32 %r372, %r49, %r371;
+ mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
- mov.u32 %r590, %r614;
+ mov.u32 %r584, %r608;
$L__BB0_81:
- shr.u32 %r66, %r590, 1;
- setp.ge.u32 %p62, %r9, %r66;
+ shr.u32 %r65, %r584, 1;
+ setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
- mad.lo.s32 %r374, %r66, %r3, %r50;
- mul.wide.s32 %rd108, %r374, 4;
+ mad.lo.s32 %r373, %r65, %r3, %r49;
+ mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p63, %r590, 7;
- mov.u32 %r590, %r66;
+ setp.gt.u32 %p63, %r584, 7;
+ mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r591, 0;
+ mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r591, %f663;
+ mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
- shl.b32 %r376, %r3, %r51;
- add.s32 %r377, %r50, %r376;
- mul.wide.s32 %rd111, %r377, 4;
+ shl.b32 %r375, %r3, %r50;
+ add.s32 %r376, %r49, %r375;
+ mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
- mov.u32 %r592, %r614;
+ mov.u32 %r586, %r608;
$L__BB0_92:
- shr.u32 %r70, %r592, 1;
- setp.ge.u32 %p68, %r9, %r70;
+ shr.u32 %r69, %r586, 1;
+ setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
- mad.lo.s32 %r378, %r70, %r3, %r50;
- mul.wide.s32 %rd114, %r378, 4;
+ mad.lo.s32 %r377, %r69, %r3, %r49;
+ mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p69, %r592, 7;
- mov.u32 %r592, %r70;
+ setp.gt.u32 %p69, %r586, 7;
+ mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r593, 0;
+ mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r593, %f664;
+ mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
- shl.b32 %r380, %r3, %r51;
- add.s32 %r381, %r50, %r380;
- mul.wide.s32 %rd117, %r381, 4;
+ shl.b32 %r379, %r3, %r50;
+ add.s32 %r380, %r49, %r379;
+ mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
- mov.u32 %r594, %r614;
+ mov.u32 %r588, %r608;
$L__BB0_103:
- shr.u32 %r74, %r594, 1;
- setp.ge.u32 %p74, %r9, %r74;
+ shr.u32 %r73, %r588, 1;
+ setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
- mad.lo.s32 %r382, %r74, %r3, %r50;
- mul.wide.s32 %rd120, %r382, 4;
+ mad.lo.s32 %r381, %r73, %r3, %r49;
+ mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p75, %r594, 7;
- mov.u32 %r594, %r74;
+ setp.gt.u32 %p75, %r588, 7;
+ mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r595, 0;
+ mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r595, %f665;
+ mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
- shl.b32 %r384, %r3, %r51;
- add.s32 %r385, %r50, %r384;
- mul.wide.s32 %rd123, %r385, 4;
+ shl.b32 %r383, %r3, %r50;
+ add.s32 %r384, %r49, %r383;
+ mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
- mov.u32 %r596, %r614;
+ mov.u32 %r590, %r608;
$L__BB0_114:
- shr.u32 %r78, %r596, 1;
- setp.ge.u32 %p80, %r9, %r78;
+ shr.u32 %r77, %r590, 1;
+ setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
- mad.lo.s32 %r386, %r78, %r3, %r50;
- mul.wide.s32 %rd126, %r386, 4;
+ mad.lo.s32 %r385, %r77, %r3, %r49;
+ mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p81, %r596, 7;
- mov.u32 %r596, %r78;
+ setp.gt.u32 %p81, %r590, 7;
+ mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r597, 0;
+ mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r597, %f666;
+ mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
- shl.b32 %r388, %r3, %r51;
- add.s32 %r389, %r50, %r388;
- mul.wide.s32 %rd129, %r389, 4;
+ shl.b32 %r387, %r3, %r50;
+ add.s32 %r388, %r49, %r387;
+ mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
- mov.u32 %r598, %r614;
+ mov.u32 %r592, %r608;
$L__BB0_125:
- shr.u32 %r82, %r598, 1;
- setp.ge.u32 %p86, %r9, %r82;
+ shr.u32 %r81, %r592, 1;
+ setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
- mad.lo.s32 %r390, %r82, %r3, %r50;
- mul.wide.s32 %rd132, %r390, 4;
+ mad.lo.s32 %r389, %r81, %r3, %r49;
+ mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p87, %r598, 7;
- mov.u32 %r598, %r82;
+ setp.gt.u32 %p87, %r592, 7;
+ mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r599, 0;
+ mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r599, %f667;
+ mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r583, 4;
+ shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
- shl.b32 %r392, %r3, %r51;
- add.s32 %r393, %r50, %r392;
- mul.wide.s32 %rd135, %r393, 4;
+ shl.b32 %r391, %r3, %r50;
+ add.s32 %r392, %r49, %r391;
+ mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
- mov.u32 %r600, %r614;
+ mov.u32 %r594, %r608;
$L__BB0_136:
- shr.u32 %r87, %r600, 1;
- setp.ge.u32 %p92, %r9, %r87;
+ shr.u32 %r86, %r594, 1;
+ setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
- mad.lo.s32 %r394, %r87, %r3, %r50;
- mul.wide.s32 %rd138, %r394, 4;
+ mad.lo.s32 %r393, %r86, %r3, %r49;
+ mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p93, %r600, 7;
- mov.u32 %r600, %r87;
+ setp.gt.u32 %p93, %r594, 7;
+ mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r601, 0;
+ mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r601, %f668;
+ mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
- shl.b32 %r396, %r3, %r51;
- add.s32 %r397, %r50, %r396;
- mul.wide.s32 %rd141, %r397, 4;
+ shl.b32 %r395, %r3, %r50;
+ add.s32 %r396, %r49, %r395;
+ mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
- mov.u32 %r602, %r614;
+ mov.u32 %r596, %r608;
$L__BB0_147:
- shr.u32 %r91, %r602, 1;
- setp.ge.u32 %p98, %r9, %r91;
+ shr.u32 %r90, %r596, 1;
+ setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
- mad.lo.s32 %r398, %r91, %r3, %r50;
- mul.wide.s32 %rd144, %r398, 4;
+ mad.lo.s32 %r397, %r90, %r3, %r49;
+ mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p99, %r602, 7;
- mov.u32 %r602, %r91;
+ setp.gt.u32 %p99, %r596, 7;
+ mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r603, 0;
+ mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r603, %f669;
+ mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
- shl.b32 %r400, %r3, %r51;
- add.s32 %r401, %r50, %r400;
- mul.wide.s32 %rd147, %r401, 4;
+ shl.b32 %r399, %r3, %r50;
+ add.s32 %r400, %r49, %r399;
+ mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
- mov.u32 %r604, %r614;
+ mov.u32 %r598, %r608;
$L__BB0_158:
- shr.u32 %r95, %r604, 1;
- setp.ge.u32 %p104, %r9, %r95;
+ shr.u32 %r94, %r598, 1;
+ setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
- mad.lo.s32 %r402, %r95, %r3, %r50;
- mul.wide.s32 %rd150, %r402, 4;
+ mad.lo.s32 %r401, %r94, %r3, %r49;
+ mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p105, %r604, 7;
- mov.u32 %r604, %r95;
+ setp.gt.u32 %p105, %r598, 7;
+ mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r605, 0;
+ mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r605, %f670;
+ mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
- shl.b32 %r404, %r3, %r51;
- add.s32 %r405, %r50, %r404;
- mul.wide.s32 %rd153, %r405, 4;
+ shl.b32 %r403, %r3, %r50;
+ add.s32 %r404, %r49, %r403;
+ mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
- mov.u32 %r606, %r614;
+ mov.u32 %r600, %r608;
$L__BB0_169:
- shr.u32 %r99, %r606, 1;
- setp.ge.u32 %p110, %r9, %r99;
+ shr.u32 %r98, %r600, 1;
+ setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
- mad.lo.s32 %r406, %r99, %r3, %r50;
- mul.wide.s32 %rd156, %r406, 4;
+ mad.lo.s32 %r405, %r98, %r3, %r49;
+ mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p111, %r606, 7;
- mov.u32 %r606, %r99;
+ setp.gt.u32 %p111, %r600, 7;
+ mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r607, 0;
+ mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r607, %f671;
+ mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
- shl.b32 %r408, %r3, %r51;
- add.s32 %r409, %r50, %r408;
- mul.wide.s32 %rd159, %r409, 4;
+ shl.b32 %r407, %r3, %r50;
+ add.s32 %r408, %r49, %r407;
+ mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
- mov.u32 %r608, %r614;
+ mov.u32 %r602, %r608;
$L__BB0_180:
- shr.u32 %r103, %r608, 1;
- setp.ge.u32 %p116, %r9, %r103;
+ shr.u32 %r102, %r602, 1;
+ setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
- mad.lo.s32 %r410, %r103, %r3, %r50;
- mul.wide.s32 %rd162, %r410, 4;
+ mad.lo.s32 %r409, %r102, %r3, %r49;
+ mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p117, %r608, 7;
- mov.u32 %r608, %r103;
+ setp.gt.u32 %p117, %r602, 7;
+ mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r609, 0;
+ mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r609, %f672;
+ mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
- shl.b32 %r412, %r3, %r51;
- add.s32 %r413, %r50, %r412;
- mul.wide.s32 %rd165, %r413, 4;
+ shl.b32 %r411, %r3, %r50;
+ add.s32 %r412, %r49, %r411;
+ mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
- mov.u32 %r610, %r614;
+ mov.u32 %r604, %r608;
$L__BB0_191:
- shr.u32 %r107, %r610, 1;
- setp.ge.u32 %p122, %r9, %r107;
+ shr.u32 %r106, %r604, 1;
+ setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
- mad.lo.s32 %r414, %r107, %r3, %r50;
- mul.wide.s32 %rd168, %r414, 4;
+ mad.lo.s32 %r413, %r106, %r3, %r49;
+ mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p123, %r610, 7;
- mov.u32 %r610, %r107;
+ setp.gt.u32 %p123, %r604, 7;
+ mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r611, 0;
+ mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r611, %f673;
+ mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
- shl.b32 %r416, %r3, %r51;
- add.s32 %r417, %r50, %r416;
- mul.wide.s32 %rd171, %r417, 4;
+ shl.b32 %r415, %r3, %r50;
+ add.s32 %r416, %r49, %r415;
+ mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
- mov.u32 %r612, %r614;
+ mov.u32 %r606, %r608;
$L__BB0_202:
- shr.u32 %r111, %r612, 1;
- setp.ge.u32 %p128, %r9, %r111;
+ shr.u32 %r110, %r606, 1;
+ setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
- mad.lo.s32 %r418, %r111, %r3, %r50;
- mul.wide.s32 %rd174, %r418, 4;
+ mad.lo.s32 %r417, %r110, %r3, %r49;
+ mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p129, %r612, 7;
- mov.u32 %r612, %r111;
+ setp.gt.u32 %p129, %r606, 7;
+ mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r613, 0;
+ mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r613, %f674;
+ mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
- shl.b32 %r420, %r3, %r51;
- add.s32 %r421, %r50, %r420;
- mul.wide.s32 %rd177, %r421, 4;
+ shl.b32 %r419, %r3, %r50;
+ add.s32 %r420, %r49, %r419;
+ mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r614, 1;
- setp.ge.u32 %p134, %r9, %r115;
+ shr.u32 %r114, %r608, 1;
+ setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
- mad.lo.s32 %r422, %r115, %r3, %r50;
- mul.wide.s32 %rd180, %r422, 4;
+ mad.lo.s32 %r421, %r114, %r3, %r49;
+ mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p135, %r614, 7;
- mov.u32 %r614, %r115;
+ setp.gt.u32 %p135, %r608, 7;
+ mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r615, 0;
+ mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@@ -1735,255 +1733,251 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r615, %f675;
+ mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
- shl.b32 %r573, %r5, 3;
- mov.u32 %r448, %ctaid.y;
- mad.lo.s32 %r449, %r202, %r448, %r573;
- add.s32 %r450, %r449, %r85;
- mul.wide.s32 %rd189, %r450, 4;
+ mov.u32 %r447, %ctaid.y;
+ mad.lo.s32 %r448, %r201, %r447, %r8;
+ add.s32 %r449, %r448, %r84;
+ mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
- st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
-
- add.s32 %r451, %r450, 4;
- mul.wide.s32 %rd190, %r451, 4;
+ st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
+
+ add.s32 %r450, %r449, 4;
+ mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
- st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
- add.s32 %r424, %r570, 3;
- sub.s32 %r118, %r424, %r202;
- mov.u32 %r425, %ctaid.y;
- mad.lo.s32 %r119, %r202, %r425, %r570;
- neg.s32 %r426, %r85;
- setp.ge.s32 %p141, %r118, %r426;
+ add.s32 %r423, %r8, 3;
+ sub.s32 %r117, %r423, %r201;
+ mov.u32 %r424, %ctaid.y;
+ mad.lo.s32 %r118, %r201, %r424, %r8;
+ neg.s32 %r425, %r84;
+ setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
- add.s32 %r431, %r119, %r85;
- mul.wide.s32 %rd184, %r431, 4;
+ add.s32 %r430, %r118, %r84;
+ mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
- st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
+ st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
$L__BB0_222:
- mov.u32 %r432, -4;
- sub.s32 %r433, %r432, %r85;
- setp.ge.s32 %p143, %r118, %r433;
+ mov.u32 %r431, -4;
+ sub.s32 %r432, %r431, %r84;
+ setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
- add.s32 %r438, %r119, %r85;
- add.s32 %r439, %r438, 4;
- mul.wide.s32 %rd186, %r439, 4;
+ add.s32 %r437, %r118, %r84;
+ add.s32 %r438, %r437, 4;
+ mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
$L__BB0_226:
- shl.b32 %r120, %r583, 5;
+ shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
- shl.b32 %r572, %r5, 3;
- mov.u32 %r476, %ctaid.y;
- mad.lo.s32 %r477, %r202, %r476, %r572;
- add.s32 %r478, %r477, %r120;
- mul.wide.s32 %rd197, %r478, 4;
+ mov.u32 %r475, %ctaid.y;
+ mad.lo.s32 %r476, %r201, %r475, %r8;
+ add.s32 %r477, %r476, %r119;
+ mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
- st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
-
- add.s32 %r479, %r478, 4;
- mul.wide.s32 %rd198, %r479, 4;
+ st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
+
+ add.s32 %r478, %r477, 4;
+ mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
- st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
bra.uni $L__BB0_233;
$L__BB0_227:
- shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
- add.s32 %r452, %r571, 3;
- sub.s32 %r121, %r452, %r202;
- mov.u32 %r453, %ctaid.y;
- mad.lo.s32 %r122, %r202, %r453, %r571;
- neg.s32 %r454, %r120;
- setp.ge.s32 %p150, %r121, %r454;
+ add.s32 %r451, %r8, 3;
+ sub.s32 %r120, %r451, %r201;
+ mov.u32 %r452, %ctaid.y;
+ mad.lo.s32 %r121, %r201, %r452, %r8;
+ neg.s32 %r453, %r119;
+ setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
- add.s32 %r459, %r122, %r120;
- mul.wide.s32 %rd192, %r459, 4;
+ add.s32 %r458, %r121, %r119;
+ mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
- st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
+ st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
$L__BB0_229:
- mov.u32 %r460, -4;
- sub.s32 %r461, %r460, %r120;
- setp.ge.s32 %p152, %r121, %r461;
+ mov.u32 %r459, -4;
+ sub.s32 %r460, %r459, %r119;
+ setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
- add.s32 %r466, %r122, %r120;
- add.s32 %r467, %r466, 4;
- mul.wide.s32 %rd194, %r467, 4;
+ add.s32 %r465, %r121, %r119;
+ add.s32 %r466, %r465, 4;
+ mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
$L__BB0_233:
- mov.u32 %r123, %ctaid.y;
+ mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r480, %r5, %r9;
- or.b32 %r482, %r480, %r353;
- setp.ne.s32 %p156, %r482, 0;
+ or.b32 %r479, %r5, %r9;
+ or.b32 %r481, %r479, %r352;
+ setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
- mov.u32 %r483, %ctaid.x;
- mov.u32 %r484, %ctaid.z;
- mov.u32 %r485, %nctaid.x;
- mad.lo.s32 %r486, %r484, %r485, %r483;
- mul.wide.s32 %rd200, %r486, 8;
+ mov.u32 %r482, %ctaid.x;
+ mov.u32 %r483, %ctaid.z;
+ mov.u32 %r484, %nctaid.x;
+ mad.lo.s32 %r485, %r483, %r484, %r482;
+ mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
- add.s32 %r487, %r11, -1;
- setp.eq.s32 %p157, %r123, %r487;
+ add.s32 %r486, %r11, -1;
+ setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
- mov.u32 %r616, 8;
+ mov.u32 %r610, 8;
$L__BB0_236:
- nanosleep.u32 %r616;
-
- setp.lt.u32 %p159, %r616, 256;
- selp.u32 %r490, 1, 0, %p159;
- shl.b32 %r616, %r616, %r490;
+ nanosleep.u32 %r610;
+
+ setp.lt.u32 %p159, %r610, 256;
+ selp.u32 %r489, 1, 0, %p159;
+ shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
- add.s32 %r491, %r11, %r3;
- add.s32 %r492, %r491, -1;
- div.s32 %r126, %r492, %r3;
- setp.lt.s32 %p161, %r126, 1;
+ add.s32 %r490, %r11, %r3;
+ add.s32 %r491, %r490, -1;
+ div.s32 %r125, %r491, %r3;
+ setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
- add.s32 %r494, %r202, 1;
- shr.u32 %r495, %r494, 31;
- add.s32 %r496, %r494, %r495;
- shr.s32 %r497, %r496, 1;
- add.s32 %r498, %r4, %r497;
- add.s32 %r499, %r498, -1;
- shl.b32 %r500, %r9, 1;
- shl.b32 %r501, %r4, 1;
- mad.lo.s32 %r502, %r501, %r123, %r500;
- or.b32 %r503, %r502, 1;
- setp.ge.s32 %p162, %r503, %r202;
- div.s32 %r504, %r499, %r4;
- setp.ge.s32 %p163, %r123, %r504;
+ add.s32 %r493, %r201, 1;
+ shr.u32 %r494, %r493, 31;
+ add.s32 %r495, %r493, %r494;
+ shr.s32 %r496, %r495, 1;
+ add.s32 %r497, %r4, %r496;
+ add.s32 %r498, %r497, -1;
+ shl.b32 %r499, %r9, 1;
+ shl.b32 %r500, %r4, 1;
+ mad.lo.s32 %r501, %r500, %r122, %r499;
+ or.b32 %r502, %r501, 1;
+ setp.ge.s32 %p162, %r502, %r201;
+ div.s32 %r503, %r498, %r4;
+ setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
- mul.lo.s32 %r505, %r4, %r123;
- shl.b32 %r506, %r505, 1;
- mad.lo.s32 %r507, %r202, %r5, %r506;
- add.s32 %r618, %r507, %r500;
- mul.lo.s32 %r128, %r202, %r3;
- mov.u32 %r493, 0;
+ mul.lo.s32 %r504, %r4, %r122;
+ shl.b32 %r505, %r504, 1;
+ mad.lo.s32 %r506, %r201, %r5, %r505;
+ add.s32 %r612, %r506, %r499;
+ mul.lo.s32 %r127, %r201, %r3;
+ mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r617, %r5;
- mov.u32 %r619, %r493;
+ mov.u32 %r611, %r5;
+ mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
- setp.ge.s32 %p164, %r617, %r11;
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ setp.ge.s32 %p164, %r611, %r11;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
- mul.wide.s32 %rd210, %r618, 4;
+ mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
- ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
+ ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
$L__BB0_242:
- mov.b32 %f558, %r621;
+ mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r620;
+ mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
- add.s32 %r618, %r618, %r128;
- add.s32 %r617, %r617, %r3;
- add.s32 %r619, %r619, 1;
- setp.lt.s32 %p165, %r619, %r126;
+ add.s32 %r612, %r612, %r127;
+ add.s32 %r611, %r611, %r3;
+ add.s32 %r613, %r613, 1;
+ setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
- clz.b32 %r514, %r3;
- mov.u32 %r515, 31;
- sub.s32 %r516, %r515, %r514;
- mov.u32 %r517, 1;
- shl.b32 %r139, %r517, %r516;
- setp.lt.u32 %p166, %r5, %r139;
- add.s32 %r518, %r139, %r5;
- setp.lt.u32 %p167, %r518, %r3;
+ clz.b32 %r513, %r3;
+ mov.u32 %r514, 31;
+ sub.s32 %r515, %r514, %r513;
+ mov.u32 %r516, 1;
+ shl.b32 %r138, %r516, %r515;
+ setp.lt.u32 %p166, %r5, %r138;
+ add.s32 %r517, %r138, %r5;
+ setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
- add.s32 %r519, %r50, %r139;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r138;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
- shr.u32 %r520, %r139, 31;
- add.s32 %r521, %r139, %r520;
- shr.s32 %r630, %r521, 1;
+ shr.u32 %r519, %r138, 31;
+ add.s32 %r520, %r138, %r519;
+ shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
@@ -1991,38 +1985,38 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
- setp.lt.s32 %p169, %r139, 4;
+ setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
- mov.u32 %r622, %r630;
+ mov.u32 %r616, %r624;
$L__BB0_247:
- setp.ge.u32 %p170, %r5, %r622;
+ setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
- add.s32 %r522, %r622, %r50;
- mul.wide.s32 %rd213, %r522, 4;
+ add.s32 %r521, %r616, %r49;
+ mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
- shr.u32 %r142, %r622, 1;
- setp.gt.u32 %p171, %r622, 3;
- mov.u32 %r622, %r142;
+ shr.u32 %r141, %r616, 1;
+ setp.gt.u32 %p171, %r616, 3;
+ mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
- add.s32 %r523, %r50, 1;
- mul.wide.u32 %rd216, %r523, 4;
+ add.s32 %r522, %r49, 1;
+ mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
@@ -2050,29 +2044,29 @@
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
- mov.u32 %r623, %r630;
+ mov.u32 %r617, %r624;
$L__BB0_257:
- setp.ge.u32 %p176, %r5, %r623;
+ setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
- add.s32 %r524, %r623, %r50;
- mul.wide.s32 %rd218, %r524, 4;
+ add.s32 %r523, %r617, %r49;
+ mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
- shr.u32 %r144, %r623, 1;
- setp.gt.u32 %p177, %r623, 3;
- mov.u32 %r623, %r144;
+ shr.u32 %r143, %r617, 1;
+ setp.gt.u32 %p177, %r617, 3;
+ mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
@@ -2091,90 +2085,90 @@
{ cvt.rn.f16.f32 %rs130, %f681;}
@%p10 bra $L__BB0_267;
- add.s32 %r525, %r202, 1;
- shr.u32 %r526, %r525, 31;
- add.s32 %r527, %r525, %r526;
- shr.s32 %r528, %r527, 1;
- add.s32 %r529, %r4, %r528;
- add.s32 %r530, %r529, -1;
- div.s32 %r531, %r530, %r4;
- setp.ge.s32 %p181, %r123, %r531;
+ add.s32 %r524, %r201, 1;
+ shr.u32 %r525, %r524, 31;
+ add.s32 %r526, %r524, %r525;
+ shr.s32 %r527, %r526, 1;
+ add.s32 %r528, %r4, %r527;
+ add.s32 %r529, %r528, -1;
+ div.s32 %r530, %r529, %r4;
+ setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
- shl.b32 %r145, %r9, 1;
- mul.lo.s32 %r532, %r4, %r123;
- shl.b32 %r146, %r532, 1;
- add.s32 %r533, %r145, %r146;
- or.b32 %r534, %r533, 1;
- setp.ge.s32 %p182, %r534, %r202;
+ shl.b32 %r144, %r9, 1;
+ mul.lo.s32 %r531, %r4, %r122;
+ shl.b32 %r145, %r531, 1;
+ add.s32 %r532, %r144, %r145;
+ or.b32 %r533, %r532, 1;
+ setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r535, %r146, %r145;
+ add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
- mul.wide.s32 %rd222, %r535, 2;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
- add.s32 %r537, %r202, 1;
- shr.u32 %r538, %r537, 31;
- add.s32 %r539, %r537, %r538;
- shr.s32 %r540, %r539, 1;
- add.s32 %r541, %r4, %r540;
- add.s32 %r542, %r541, -1;
- shl.b32 %r543, %r9, 1;
- shl.b32 %r544, %r4, 1;
- mad.lo.s32 %r545, %r544, %r123, %r543;
- or.b32 %r546, %r545, 1;
- setp.ge.s32 %p184, %r546, %r202;
- div.s32 %r547, %r542, %r4;
- setp.ge.s32 %p185, %r123, %r547;
+ add.s32 %r536, %r201, 1;
+ shr.u32 %r537, %r536, 31;
+ add.s32 %r538, %r536, %r537;
+ shr.s32 %r539, %r538, 1;
+ add.s32 %r540, %r4, %r539;
+ add.s32 %r541, %r540, -1;
+ shl.b32 %r542, %r9, 1;
+ shl.b32 %r543, %r4, 1;
+ mad.lo.s32 %r544, %r543, %r122, %r542;
+ or.b32 %r545, %r544, 1;
+ setp.ge.s32 %p184, %r545, %r201;
+ div.s32 %r546, %r541, %r4;
+ setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
- mul.lo.s32 %r548, %r4, %r123;
- shl.b32 %r549, %r548, 1;
- mad.lo.s32 %r550, %r202, %r5, %r549;
- add.s32 %r625, %r550, %r543;
- mul.lo.s32 %r148, %r202, %r3;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r547, %r4, %r122;
+ shl.b32 %r548, %r547, 1;
+ mad.lo.s32 %r549, %r201, %r5, %r548;
+ add.s32 %r619, %r549, %r542;
+ mul.lo.s32 %r147, %r201, %r3;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r624, %r5;
- mov.u32 %r626, %r536;
+ mov.u32 %r618, %r5;
+ mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
- setp.ge.s32 %p186, %r624, %r11;
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ setp.ge.s32 %p186, %r618, %r11;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
- mul.wide.s32 %rd225, %r625, 4;
+ mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
- ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
+ ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
$L__BB0_272:
- mov.b32 %f584, %r628;
+ mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r627;
+ mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
- add.s32 %r625, %r625, %r148;
- add.s32 %r624, %r624, %r3;
- add.s32 %r626, %r626, 1;
- setp.lt.s32 %p187, %r626, %r126;
+ add.s32 %r619, %r619, %r147;
+ add.s32 %r618, %r618, %r3;
+ add.s32 %r620, %r620, 1;
+ setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2187,29 +2181,29 @@
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
- mov.u32 %r629, %r630;
+ mov.u32 %r623, %r624;
$L__BB0_277:
- setp.ge.u32 %p190, %r5, %r629;
+ setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
- add.s32 %r557, %r629, %r50;
- mul.wide.s32 %rd226, %r557, 4;
+ add.s32 %r556, %r623, %r49;
+ mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
- shr.u32 %r160, %r629, 1;
- setp.gt.u32 %p191, %r629, 3;
- mov.u32 %r629, %r160;
+ shr.u32 %r159, %r623, 1;
+ setp.gt.u32 %p191, %r623, 3;
+ mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
@@ -2240,26 +2234,26 @@
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
- setp.ge.u32 %p196, %r5, %r630;
+ setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
- add.s32 %r558, %r630, %r50;
- mul.wide.s32 %rd229, %r558, 4;
+ add.s32 %r557, %r624, %r49;
+ mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
- shr.u32 %r162, %r630, 1;
- setp.gt.u32 %p197, %r630, 3;
- mov.u32 %r630, %r162;
+ shr.u32 %r161, %r624, 1;
+ setp.gt.u32 %p197, %r624, 3;
+ mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
@@ -2278,32 +2272,32 @@
{ cvt.rn.f16.f32 %rs132, %f687;}
@%p10 bra $L__BB0_296;
- add.s32 %r559, %r202, 1;
- shr.u32 %r560, %r559, 31;
- add.s32 %r561, %r559, %r560;
- shr.s32 %r562, %r561, 1;
- add.s32 %r563, %r4, %r562;
- add.s32 %r564, %r563, -1;
- div.s32 %r565, %r564, %r4;
- setp.ge.s32 %p201, %r123, %r565;
+ add.s32 %r558, %r201, 1;
+ shr.u32 %r559, %r558, 31;
+ add.s32 %r560, %r558, %r559;
+ shr.s32 %r561, %r560, 1;
+ add.s32 %r562, %r4, %r561;
+ add.s32 %r563, %r562, -1;
+ div.s32 %r564, %r563, %r4;
+ setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
- shl.b32 %r163, %r9, 1;
- mul.lo.s32 %r566, %r4, %r123;
- shl.b32 %r164, %r566, 1;
- add.s32 %r567, %r163, %r164;
- or.b32 %r568, %r567, 1;
- setp.ge.s32 %p202, %r568, %r202;
+ shl.b32 %r162, %r9, 1;
+ mul.lo.s32 %r565, %r4, %r122;
+ shl.b32 %r163, %r565, 1;
+ add.s32 %r566, %r162, %r163;
+ or.b32 %r567, %r566, 1;
+ setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r569, %r164, %r163;
+ add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
- mul.wide.s32 %rd233, %r569, 2;
+ mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
15: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_576
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<610>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r222, %r223}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r226, %r227}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r248, %r213, 7;
shr.s32 %r249, %r248, 31;
shr.u32 %r250, %r249, 29;
add.s32 %r251, %r248, %r250;
shr.s32 %r2, %r251, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
mov.u32 %r252, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r252;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r253, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r254, %r4, %r2;
shl.b32 %r255, %r254, 4;
or.b32 %r256, %r255, 15;
and.b32 %r7, %r256, -16;
add.s32 %r257, %r256, %r7;
and.b32 %r258, %r257, -16;
cvt.s64.s32 %rd1, %r258;
shl.b32 %r259, %r4, 2;
max.s32 %r260, %r2, %r3;
mad.lo.s32 %r261, %r259, %r260, 15;
and.b32 %r262, %r261, -16;
cvt.u64.u32 %rd2, %r262;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r263, %r8, 7;
setp.lt.s32 %p9, %r263, %r213;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r264, smem_ptr; }
// end inline asm
shl.b32 %r267, %r5, 4;
add.s32 %r265, %r264, %r267;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r266, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r266, 0;
cp.async.ca.shared.global [%r265], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r562, %r6, 4;
add.s32 %r268, %r4, 215;
div.s32 %r269, %r268, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r270, %r11, %r269;
add.s32 %r271, %r270, -1;
div.s32 %r12, %r271, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r213;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
mov.u32 %r273, %ctaid.y;
mul.lo.s32 %r274, %r12, %r4;
mul.lo.s32 %r13, %r274, %r273;
shl.b32 %r275, %r9, 1;
mov.u32 %r276, 1;
shl.b32 %r277, %r5, 4;
mad.lo.s32 %r14, %r275, %r213, %r277;
mul.lo.s32 %r278, %r213, %r9;
cvt.s64.s32 %rd54, %r278;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r279, %r13, %r213;
cvt.s64.s32 %rd6, %r279;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r280, %tid.z;
mad.lo.s32 %r281, %r4, %r280, %r9;
mad.lo.s32 %r15, %r281, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
clz.b32 %r282, %r3;
mov.u32 %r283, 31;
sub.s32 %r284, %r283, %r282;
shl.b32 %r16, %r276, %r284;
setp.lt.u32 %p15, %r5, %r16;
add.s32 %r285, %r16, %r5;
setp.lt.u32 %p16, %r285, %r3;
and.pred %p2, %p15, %p16;
add.s32 %r286, %r15, %r16;
mul.wide.s32 %rd57, %r286, 4;
add.s64 %rd8, %rd45, %rd57;
shr.u32 %r287, %r16, 31;
add.s32 %r288, %r16, %r287;
shr.s32 %r17, %r288, 1;
add.s32 %r18, %r278, %r8;
add.s64 %rd58, %rd45, %rd51;
mul.wide.s32 %rd59, %r18, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
mul.wide.s32 %rd63, %r281, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
mov.u32 %r559, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
// end inline asm
add.s32 %r293, %r14, %r292;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
// end inline asm
add.s32 %r296, %r14, %r295;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r559, %r4;
add.s32 %r290, %r23, %r9;
add.s32 %r24, %r290, %r13;
setp.gt.s32 %p17, %r24, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
mul.lo.s32 %r291, %r24, %r222;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
setp.lt.s32 %p18, %r24, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
mul.lo.s32 %r557, %r559, %r4;
mul.lo.s32 %r298, %r557, %r213;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
mov.u32 %r297, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r293], [%rd68], 16, p0;
}
// end inline asm
add.s64 %rd70, %rd33, %rd74;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r296], [%rd70], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r555, %r559, %r4;
add.s32 %r554, %r555, %r9;
add.s32 %r553, %r554, %r13;
setp.gt.s32 %p199, %r553, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
mul.lo.s32 %r299, %r24, %r226;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r304, %r305, %r306, %r307}, [%rd9];
ld.shared.v4.u32 {%r312, %r313, %r314, %r315}, [%rd10];
ld.shared.v4.u32 {%r320, %r321, %r322, %r323}, [%rd12];
mov.b32 {%rs36, %rs39}, %r320;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r312;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r304;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r321;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r313;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r305;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r322;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r314;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r306;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r323;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r315;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r307;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r562, %r562, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
mov.u32 %r560, %r17;
$L__BB0_20:
setp.ge.u32 %p24, %r5, %r560;
@%p24 bra $L__BB0_22;
add.s32 %r328, %r560, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r560, 1;
setp.gt.u32 %p25, %r560, 3;
mov.u32 %r560, %r36;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
setp.lt.u32 %p27, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p27 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p22 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
mov.u32 %r561, %r17;
$L__BB0_30:
setp.ge.u32 %p30, %r5, %r561;
@%p30 bra $L__BB0_32;
add.s32 %r329, %r561, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r561, 1;
setp.gt.u32 %p31, %r561, 3;
mov.u32 %r561, %r38;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
setp.lt.u32 %p33, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p33 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p8 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p8 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r556, %r559, %r4;
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r354;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r338;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r334, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r347;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r355;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r339;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r335, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r348;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r356;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r340;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r336, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r349;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r357;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r341;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r337, {%rs124, %rs128};
add.s32 %r362, %r13, %r556;
mad.lo.s32 %r363, %r362, %r213, %r18;
mul.wide.s32 %rd84, %r363, 2;
add.s64 %rd83, %rd38, %rd84;
// begin inline asm
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r559, %r559, 1;
setp.lt.s32 %p37, %r559, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r364, %tid.z;
mad.lo.s32 %r365, %r4, %r364, %r9;
mad.lo.s32 %r50, %r365, %r3, %r5;
mul.wide.u32 %rd85, %r50, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r366, %r4;
mov.u32 %r367, 31;
sub.s32 %r51, %r367, %r366;
mov.u32 %r368, 1;
shl.b32 %r593, %r368, %r51;
setp.lt.u32 %p38, %r9, %r593;
add.s32 %r369, %r593, %r9;
setp.lt.u32 %p39, %r369, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
shl.b32 %r370, %r3, %r51;
add.s32 %r371, %r50, %r370;
mul.wide.s32 %rd87, %r371, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p41, %r593, 4;
@%p41 bra $L__BB0_51;
mov.u32 %r563, %r593;
$L__BB0_48:
shr.u32 %r54, %r563, 1;
setp.ge.u32 %p42, %r9, %r54;
@%p42 bra $L__BB0_50;
mad.lo.s32 %r372, %r54, %r3, %r50;
mul.wide.s32 %rd90, %r372, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p43, %r563, 7;
mov.u32 %r563, %r54;
@%p43 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r564, 0;
add.s32 %r374, %r50, %r3;
mul.wide.u32 %rd93, %r374, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p45, %r4, 2;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r564, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
shl.b32 %r375, %r3, %r51;
add.s32 %r376, %r50, %r375;
mul.wide.s32 %rd95, %r376, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
mov.u32 %r565, %r593;
$L__BB0_59:
shr.u32 %r58, %r565, 1;
setp.ge.u32 %p48, %r9, %r58;
@%p48 bra $L__BB0_61;
mad.lo.s32 %r377, %r58, %r3, %r50;
mul.wide.s32 %rd98, %r377, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p49, %r565, 7;
mov.u32 %r565, %r58;
@%p49 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r566, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@%p51 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r566, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
shl.b32 %r379, %r3, %r51;
add.s32 %r380, %r50, %r379;
mul.wide.s32 %rd101, %r380, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
mov.u32 %r567, %r593;
$L__BB0_70:
shr.u32 %r62, %r567, 1;
setp.ge.u32 %p54, %r9, %r62;
@%p54 bra $L__BB0_72;
mad.lo.s32 %r381, %r62, %r3, %r50;
mul.wide.s32 %rd104, %r381, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p55, %r567, 7;
mov.u32 %r567, %r62;
@%p55 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r568, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r568, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
shl.b32 %r383, %r3, %r51;
add.s32 %r384, %r50, %r383;
mul.wide.s32 %rd107, %r384, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
mov.u32 %r569, %r593;
$L__BB0_81:
shr.u32 %r66, %r569, 1;
setp.ge.u32 %p60, %r9, %r66;
@%p60 bra $L__BB0_83;
mad.lo.s32 %r385, %r66, %r3, %r50;
mul.wide.s32 %rd110, %r385, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p61, %r569, 7;
mov.u32 %r569, %r66;
@%p61 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r570, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r570, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
shl.b32 %r387, %r3, %r51;
add.s32 %r388, %r50, %r387;
mul.wide.s32 %rd113, %r388, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
mov.u32 %r571, %r593;
$L__BB0_92:
shr.u32 %r70, %r571, 1;
setp.ge.u32 %p66, %r9, %r70;
@%p66 bra $L__BB0_94;
mad.lo.s32 %r389, %r70, %r3, %r50;
mul.wide.s32 %rd116, %r389, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p67, %r571, 7;
mov.u32 %r571, %r70;
@%p67 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r572, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r572, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
shl.b32 %r391, %r3, %r51;
add.s32 %r392, %r50, %r391;
mul.wide.s32 %rd119, %r392, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
mov.u32 %r573, %r593;
$L__BB0_103:
shr.u32 %r74, %r573, 1;
setp.ge.u32 %p72, %r9, %r74;
@%p72 bra $L__BB0_105;
mad.lo.s32 %r393, %r74, %r3, %r50;
mul.wide.s32 %rd122, %r393, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p73, %r573, 7;
mov.u32 %r573, %r74;
@%p73 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r574, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r574, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
shl.b32 %r395, %r3, %r51;
add.s32 %r396, %r50, %r395;
mul.wide.s32 %rd125, %r396, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
mov.u32 %r575, %r593;
$L__BB0_114:
shr.u32 %r78, %r575, 1;
setp.ge.u32 %p78, %r9, %r78;
@%p78 bra $L__BB0_116;
mad.lo.s32 %r397, %r78, %r3, %r50;
mul.wide.s32 %rd128, %r397, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p79, %r575, 7;
mov.u32 %r575, %r78;
@%p79 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r576, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r576, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
shl.b32 %r399, %r3, %r51;
add.s32 %r400, %r50, %r399;
mul.wide.s32 %rd131, %r400, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
mov.u32 %r577, %r593;
$L__BB0_125:
shr.u32 %r82, %r577, 1;
setp.ge.u32 %p84, %r9, %r82;
@%p84 bra $L__BB0_127;
mad.lo.s32 %r401, %r82, %r3, %r50;
mul.wide.s32 %rd134, %r401, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p85, %r577, 7;
mov.u32 %r577, %r82;
@%p85 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r578, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r578, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r562, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
shl.b32 %r403, %r3, %r51;
add.s32 %r404, %r50, %r403;
mul.wide.s32 %rd137, %r404, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
mov.u32 %r579, %r593;
$L__BB0_136:
shr.u32 %r87, %r579, 1;
setp.ge.u32 %p90, %r9, %r87;
@%p90 bra $L__BB0_138;
mad.lo.s32 %r405, %r87, %r3, %r50;
mul.wide.s32 %rd140, %r405, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p91, %r579, 7;
mov.u32 %r579, %r87;
@%p91 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r580, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r580, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
shl.b32 %r407, %r3, %r51;
add.s32 %r408, %r50, %r407;
mul.wide.s32 %rd143, %r408, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
mov.u32 %r581, %r593;
$L__BB0_147:
shr.u32 %r91, %r581, 1;
setp.ge.u32 %p96, %r9, %r91;
@%p96 bra $L__BB0_149;
mad.lo.s32 %r409, %r91, %r3, %r50;
mul.wide.s32 %rd146, %r409, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p97, %r581, 7;
mov.u32 %r581, %r91;
@%p97 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r582, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r582, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
shl.b32 %r411, %r3, %r51;
add.s32 %r412, %r50, %r411;
mul.wide.s32 %rd149, %r412, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
mov.u32 %r583, %r593;
$L__BB0_158:
shr.u32 %r95, %r583, 1;
setp.ge.u32 %p102, %r9, %r95;
@%p102 bra $L__BB0_160;
mad.lo.s32 %r413, %r95, %r3, %r50;
mul.wide.s32 %rd152, %r413, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p103, %r583, 7;
mov.u32 %r583, %r95;
@%p103 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r584, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r584, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
shl.b32 %r415, %r3, %r51;
add.s32 %r416, %r50, %r415;
mul.wide.s32 %rd155, %r416, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
mov.u32 %r585, %r593;
$L__BB0_169:
shr.u32 %r99, %r585, 1;
setp.ge.u32 %p108, %r9, %r99;
@%p108 bra $L__BB0_171;
mad.lo.s32 %r417, %r99, %r3, %r50;
mul.wide.s32 %rd158, %r417, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p109, %r585, 7;
mov.u32 %r585, %r99;
@%p109 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r586, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r586, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
shl.b32 %r419, %r3, %r51;
add.s32 %r420, %r50, %r419;
mul.wide.s32 %rd161, %r420, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
mov.u32 %r587, %r593;
$L__BB0_180:
shr.u32 %r103, %r587, 1;
setp.ge.u32 %p114, %r9, %r103;
@%p114 bra $L__BB0_182;
mad.lo.s32 %r421, %r103, %r3, %r50;
mul.wide.s32 %rd164, %r421, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p115, %r587, 7;
mov.u32 %r587, %r103;
@%p115 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r588, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r588, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
shl.b32 %r423, %r3, %r51;
add.s32 %r424, %r50, %r423;
mul.wide.s32 %rd167, %r424, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
mov.u32 %r589, %r593;
$L__BB0_191:
shr.u32 %r107, %r589, 1;
setp.ge.u32 %p120, %r9, %r107;
@%p120 bra $L__BB0_193;
mad.lo.s32 %r425, %r107, %r3, %r50;
mul.wide.s32 %rd170, %r425, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p121, %r589, 7;
mov.u32 %r589, %r107;
@%p121 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r590, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r590, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
shl.b32 %r427, %r3, %r51;
add.s32 %r428, %r50, %r427;
mul.wide.s32 %rd173, %r428, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
mov.u32 %r591, %r593;
$L__BB0_202:
shr.u32 %r111, %r591, 1;
setp.ge.u32 %p126, %r9, %r111;
@%p126 bra $L__BB0_204;
mad.lo.s32 %r429, %r111, %r3, %r50;
mul.wide.s32 %rd176, %r429, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p127, %r591, 7;
mov.u32 %r591, %r111;
@%p127 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r592, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r592, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
shl.b32 %r431, %r3, %r51;
add.s32 %r432, %r50, %r431;
mul.wide.s32 %rd179, %r432, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r593, 1;
setp.ge.u32 %p132, %r9, %r115;
@%p132 bra $L__BB0_214;
mad.lo.s32 %r433, %r115, %r3, %r50;
mul.wide.s32 %rd182, %r433, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p133, %r593, 7;
mov.u32 %r593, %r115;
@%p133 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r594, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r594, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
shl.b32 %r552, %r5, 3;
mov.u32 %r459, %ctaid.y;
mad.lo.s32 %r460, %r213, %r459, %r552;
add.s32 %r461, %r460, %r85;
mul.wide.s32 %rd191, %r461, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r564,%r566,%r568,%r570};
// end inline asm
add.s32 %r462, %r461, 4;
mul.wide.s32 %rd192, %r462, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r572,%r574,%r576,%r578};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r549, %r5, 3;
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
add.s32 %r435, %r549, 3;
sub.s32 %r118, %r435, %r213;
mov.u32 %r436, %ctaid.y;
mad.lo.s32 %r119, %r213, %r436, %r549;
neg.s32 %r437, %r85;
setp.ge.s32 %p139, %r118, %r437;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
add.s32 %r442, %r119, %r85;
mul.wide.s32 %rd186, %r442, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r564,%r566,%r568,%r570};
// end inline asm
$L__BB0_222:
mov.u32 %r443, -4;
sub.s32 %r444, %r443, %r85;
setp.ge.s32 %p141, %r118, %r444;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
add.s32 %r449, %r119, %r85;
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd188, %r450, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r572,%r574,%r576,%r578};
// end inline asm
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
shl.b32 %r551, %r5, 3;
shl.b32 %r487, %r562, 5;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r489, %r213, %r488, %r551;
add.s32 %r490, %r489, %r487;
mul.wide.s32 %rd199, %r490, 4;
add.s64 %rd197, %rd42, %rd199;
// begin inline asm
st.volatile.global.v4.s32 [%rd197], {%r580,%r582,%r584,%r586};
// end inline asm
add.s32 %r491, %r490, 4;
mul.wide.s32 %rd200, %r491, 4;
add.s64 %rd198, %rd42, %rd200;
// begin inline asm
st.volatile.global.v4.s32 [%rd198], {%r588,%r590,%r592,%r594};
// end inline asm
bra.uni $L__BB0_235;
$L__BB0_227:
shl.b32 %r550, %r5, 3;
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
add.s32 %r463, %r550, 3;
sub.s32 %r120, %r463, %r213;
mov.u32 %r464, %ctaid.y;
mad.lo.s32 %r121, %r213, %r464, %r550;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
shl.b32 %r122, %r562, 5;
neg.s32 %r465, %r122;
setp.ge.s32 %p148, %r120, %r465;
@%p148 bra $L__BB0_230;
add.s32 %r470, %r121, %r122;
mul.wide.s32 %rd194, %r470, 4;
add.s64 %rd193, %rd42, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r580,%r582,%r584,%r586};
// end inline asm
$L__BB0_230:
@%p147 bra $L__BB0_235;
shl.b32 %r123, %r562, 5;
mov.u32 %r471, -4;
sub.s32 %r472, %r471, %r123;
setp.ge.s32 %p150, %r120, %r472;
@%p150 bra $L__BB0_235;
add.s32 %r477, %r121, %r123;
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd196, %r478, 4;
add.s64 %rd195, %rd42, %rd196;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r588,%r590,%r592,%r594};
// end inline asm
$L__BB0_235:
mov.u32 %r124, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r492, %r5, %r9;
or.b32 %r494, %r492, %r364;
setp.ne.s32 %p152, %r494, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
mov.u32 %r495, %ctaid.x;
mov.u32 %r496, %ctaid.z;
mov.u32 %r497, %nctaid.x;
mad.lo.s32 %r498, %r496, %r497, %r495;
mul.wide.s32 %rd202, %r498, 8;
add.s64 %rd27, %rd201, %rd202;
add.s32 %r499, %r11, -1;
setp.eq.s32 %p153, %r124, %r499;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
mov.u32 %r595, 8;
$L__BB0_238:
// begin inline asm
nanosleep.u32 %r595;
// end inline asm
setp.lt.u32 %p155, %r595, 256;
selp.u32 %r502, 1, 0, %p155;
shl.b32 %r595, %r595, %r502;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_55e55727_1033910nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
mov.u32 %r504, 1;
add.s32 %r505, %r213, 1;
shr.u32 %r506, %r505, 31;
add.s32 %r507, %r505, %r506;
shr.s32 %r508, %r507, 1;
add.s32 %r509, %r4, %r508;
add.s32 %r510, %r509, -1;
div.s32 %r511, %r510, %r4;
add.s32 %r512, %r11, -1;
add.s32 %r513, %r512, %r511;
div.s32 %r127, %r513, %r11;
add.s32 %r128, %r512, %r3;
shl.b32 %r129, %r9, 1;
shl.b32 %r514, %r4, 1;
mad.lo.s32 %r132, %r514, %r124, %r129;
or.b32 %r130, %r132, 1;
mul.lo.s32 %r131, %r514, %r11;
clz.b32 %r515, %r3;
mov.u32 %r516, 31;
sub.s32 %r517, %r516, %r515;
shl.b32 %r133, %r504, %r517;
setp.lt.u32 %p157, %r5, %r133;
add.s32 %r518, %r133, %r5;
setp.lt.u32 %p158, %r518, %r3;
and.pred %p7, %p157, %p158;
add.s32 %r519, %r50, %r133;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd45, %rd211;
shr.u32 %r520, %r133, 31;
add.s32 %r521, %r133, %r520;
shr.s32 %r134, %r521, 1;
add.s32 %r522, %r50, 1;
mul.wide.u32 %rd213, %r522, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
mov.u32 %r596, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
add.s32 %r596, %r596, 1;
$L__BB0_240:
.pragma "nounroll";
setp.lt.s32 %p159, %r596, %r127;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
div.s32 %r160, %r128, %r3;
setp.lt.s32 %p180, %r160, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
mul.lo.s32 %r537, %r131, %r596;
add.s32 %r161, %r130, %r537;
add.s32 %r162, %r132, %r537;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r605, %r536;
$L__BB0_276:
.pragma "nounroll";
setp.ge.s32 %p181, %r161, %r213;
mov.u32 %r606, %r536;
mov.u32 %r607, %r536;
@%p181 bra $L__BB0_279;
mad.lo.s32 %r164, %r605, %r3, %r5;
setp.ge.s32 %p182, %r164, %r11;
mov.u32 %r606, %r536;
mov.u32 %r607, %r536;
@%p182 bra $L__BB0_279;
mad.lo.s32 %r544, %r164, %r213, %r162;
mul.wide.s32 %rd225, %r544, 4;
add.s64 %rd224, %rd41, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r607,%r606}, [%rd224];
// end inline asm
$L__BB0_279:
mov.b32 %f584, %r607;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r606;
add.f32 %f684, %f684, %f585;
add.s32 %r605, %r605, 1;
setp.lt.s32 %p183, %r605, %r160;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p184 bra $L__BB0_282;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
setp.lt.s32 %p185, %r133, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
mov.u32 %r608, %r134;
$L__BB0_284:
setp.ge.u32 %p186, %r5, %r608;
@%p186 bra $L__BB0_286;
add.s32 %r545, %r608, %r50;
mul.wide.s32 %rd226, %r545, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
shr.u32 %r171, %r608, 1;
setp.gt.u32 %p187, %r608, 3;
mov.u32 %r608, %r171;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
setp.lt.u32 %p189, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p189 bra $L__BB0_290;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_290:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p184 bra $L__BB0_292;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
mov.u32 %r609, %r134;
$L__BB0_294:
setp.ge.u32 %p192, %r5, %r609;
@%p192 bra $L__BB0_296;
add.s32 %r546, %r609, %r50;
mul.wide.s32 %rd229, %r546, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
shr.u32 %r173, %r609, 1;
setp.gt.u32 %p193, %r609, 3;
mov.u32 %r609, %r173;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
setp.lt.u32 %p195, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p195 bra $L__BB0_300;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_300:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p8 bra $L__BB0_303;
mul.lo.s32 %r174, %r131, %r596;
add.s32 %r547, %r130, %r174;
setp.ge.s32 %p197, %r547, %r213;
@%p197 bra $L__BB0_303;
add.s32 %r548, %r132, %r174;
mul.wide.s32 %rd232, %r548, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
setp.lt.s32 %p160, %r127, 1;
@%p160 bra $L__BB0_273;
div.s32 %r136, %r128, %r3;
mad.lo.s32 %r137, %r213, %r5, %r129;
shl.b32 %r138, %r124, 1;
shl.b32 %r139, %r11, 1;
mul.lo.s32 %r140, %r213, %r3;
mov.u32 %r597, 0;
$L__BB0_243:
.pragma "nounroll";
setp.lt.s32 %p161, %r136, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
mad.lo.s32 %r142, %r131, %r597, %r130;
mad.lo.s32 %r525, %r139, %r597, %r138;
mad.lo.s32 %r599, %r4, %r525, %r137;
mov.u32 %r524, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r598, %r5;
mov.u32 %r600, %r524;
$L__BB0_245:
.pragma "nounroll";
setp.ge.s32 %p162, %r142, %r213;
mov.u32 %r601, %r524;
mov.u32 %r602, %r524;
@%p162 bra $L__BB0_248;
setp.ge.s32 %p163, %r598, %r11;
mov.u32 %r601, %r524;
mov.u32 %r602, %r524;
@%p163 bra $L__BB0_248;
mul.wide.s32 %rd215, %r599, 4;
add.s64 %rd214, %rd42, %rd215;
// begin inline asm
ld.volatile.global.v2.s32 {%r602,%r601}, [%rd214];
// end inline asm
$L__BB0_248:
mov.b32 %f558, %r602;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r601;
add.f32 %f678, %f678, %f559;
add.s32 %r599, %r599, %r140;
add.s32 %r598, %r598, %r3;
add.s32 %r600, %r600, 1;
setp.lt.s32 %p164, %r600, %r136;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@%p184 bra $L__BB0_251;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
setp.lt.s32 %p166, %r133, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
mov.u32 %r603, %r134;
$L__BB0_253:
setp.ge.u32 %p167, %r5, %r603;
@%p167 bra $L__BB0_255;
add.s32 %r532, %r603, %r50;
mul.wide.s32 %rd216, %r532, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
shr.u32 %r155, %r603, 1;
setp.gt.u32 %p168, %r603, 3;
mov.u32 %r603, %r155;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
setp.lt.u32 %p170, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p170 bra $L__BB0_259;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_259:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p184 bra $L__BB0_261;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
mov.u32 %r604, %r134;
$L__BB0_263:
setp.ge.u32 %p173, %r5, %r604;
@%p173 bra $L__BB0_265;
add.s32 %r533, %r604, %r50;
mul.wide.s32 %rd219, %r533, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
shr.u32 %r157, %r604, 1;
setp.gt.u32 %p174, %r604, 3;
mov.u32 %r604, %r157;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
setp.lt.u32 %p176, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p176 bra $L__BB0_269;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_269:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p8 bra $L__BB0_272;
mul.lo.s32 %r158, %r131, %r597;
add.s32 %r534, %r130, %r158;
setp.ge.s32 %p178, %r534, %r213;
@%p178 bra $L__BB0_272;
add.s32 %r535, %r132, %r158;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
add.s32 %r597, %r597, 1;
setp.lt.s32 %p179, %r597, %r127;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<604>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r221, %r222}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r225, %r226}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r247, %r212, 7;
shr.s32 %r248, %r247, 31;
shr.u32 %r249, %r248, 29;
add.s32 %r250, %r247, %r249;
shr.s32 %r2, %r250, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
mov.u32 %r251, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r251;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r252, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r253, %r4, %r2;
shl.b32 %r254, %r253, 4;
or.b32 %r255, %r254, 15;
and.b32 %r7, %r255, -16;
add.s32 %r256, %r255, %r7;
and.b32 %r257, %r256, -16;
cvt.s64.s32 %rd1, %r257;
shl.b32 %r258, %r4, 2;
max.s32 %r259, %r2, %r3;
mad.lo.s32 %r260, %r258, %r259, 15;
and.b32 %r261, %r260, -16;
cvt.u64.u32 %rd2, %r261;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r262, %r8, 7;
setp.lt.s32 %p9, %r262, %r212;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r263, smem_ptr; }
// end inline asm
shl.b32 %r266, %r5, 4;
add.s32 %r264, %r263, %r266;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r265, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r265, 0;
cp.async.ca.shared.global [%r264], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r556, %r6, 4;
add.s32 %r267, %r4, 215;
div.s32 %r268, %r267, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r269, %r11, %r268;
add.s32 %r270, %r269, -1;
div.s32 %r12, %r270, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r212;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
mov.u32 %r272, %ctaid.y;
mul.lo.s32 %r273, %r12, %r4;
mul.lo.s32 %r13, %r273, %r272;
mad.lo.s32 %r274, %r2, %r9, %r5;
shl.b32 %r14, %r274, 4;
mul.lo.s32 %r275, %r212, %r9;
cvt.s64.s32 %rd54, %r275;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r276, %r13, %r212;
cvt.s64.s32 %rd6, %r276;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r277, %tid.z;
mad.lo.s32 %r278, %r4, %r277, %r9;
mad.lo.s32 %r15, %r278, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
clz.b32 %r279, %r3;
mov.u32 %r280, 31;
sub.s32 %r281, %r280, %r279;
mov.u32 %r282, 1;
shl.b32 %r16, %r282, %r281;
setp.lt.u32 %p15, %r5, %r16;
add.s32 %r283, %r16, %r5;
setp.lt.u32 %p16, %r283, %r3;
and.pred %p2, %p15, %p16;
add.s32 %r284, %r15, %r16;
mul.wide.s32 %rd57, %r284, 4;
add.s64 %rd8, %rd45, %rd57;
shr.u32 %r285, %r16, 31;
add.s32 %r286, %r16, %r285;
shr.s32 %r17, %r286, 1;
shl.b32 %r287, %r9, 3;
mad.lo.s32 %r288, %r287, %r2, %r8;
add.s64 %rd58, %rd45, %rd51;
mul.wide.s32 %rd59, %r288, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
mul.wide.s32 %rd63, %r278, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
mov.u32 %r553, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
// end inline asm
add.s32 %r293, %r292, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
// end inline asm
add.s32 %r296, %r295, %r14;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r553, %r4;
add.s32 %r290, %r22, %r9;
add.s32 %r23, %r290, %r13;
setp.gt.s32 %p17, %r23, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
mul.lo.s32 %r291, %r23, %r221;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
setp.lt.s32 %p18, %r23, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
mul.lo.s32 %r551, %r553, %r4;
mul.lo.s32 %r298, %r551, %r212;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
mov.u32 %r297, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r293], [%rd68], 16, p0;
}
// end inline asm
add.s64 %rd70, %rd33, %rd74;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r296], [%rd70], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r550, %r553, %r4;
add.s32 %r549, %r550, %r9;
add.s32 %r548, %r549, %r13;
setp.gt.s32 %p199, %r548, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
mul.lo.s32 %r299, %r23, %r225;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r304, %r305, %r306, %r307}, [%rd9];
ld.shared.v4.u32 {%r312, %r313, %r314, %r315}, [%rd10];
ld.shared.v4.u32 {%r320, %r321, %r322, %r323}, [%rd12];
mov.b32 {%rs36, %rs39}, %r320;
// begin inline asm
{ cvt.f32.f16 %f221, %rs36;}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r312;
// begin inline asm
{ cvt.f32.f16 %f222, %rs37;}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r304;
// begin inline asm
{ cvt.f32.f16 %f223, %rs38;}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f224, %rs39;}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ cvt.f32.f16 %f225, %rs40;}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ cvt.f32.f16 %f226, %rs41;}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r321;
// begin inline asm
{ cvt.f32.f16 %f227, %rs42;}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r313;
// begin inline asm
{ cvt.f32.f16 %f228, %rs43;}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r305;
// begin inline asm
{ cvt.f32.f16 %f229, %rs44;}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ cvt.f32.f16 %f230, %rs45;}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ cvt.f32.f16 %f231, %rs46;}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ cvt.f32.f16 %f232, %rs47;}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r322;
// begin inline asm
{ cvt.f32.f16 %f233, %rs48;}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r314;
// begin inline asm
{ cvt.f32.f16 %f234, %rs49;}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r306;
// begin inline asm
{ cvt.f32.f16 %f235, %rs50;}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ cvt.f32.f16 %f236, %rs51;}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ cvt.f32.f16 %f237, %rs52;}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ cvt.f32.f16 %f238, %rs53;}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r323;
// begin inline asm
{ cvt.f32.f16 %f239, %rs54;}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r315;
// begin inline asm
{ cvt.f32.f16 %f240, %rs55;}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r307;
// begin inline asm
{ cvt.f32.f16 %f241, %rs56;}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ cvt.f32.f16 %f242, %rs57;}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ cvt.f32.f16 %f243, %rs58;}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ cvt.f32.f16 %f244, %rs59;}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r556, %r556, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
mov.u32 %r554, %r17;
$L__BB0_20:
setp.ge.u32 %p24, %r5, %r554;
@%p24 bra $L__BB0_22;
add.s32 %r328, %r554, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r554, 1;
setp.gt.u32 %p25, %r554, 3;
mov.u32 %r554, %r35;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
setp.lt.u32 %p27, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p27 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p22 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
mov.u32 %r555, %r17;
$L__BB0_30:
setp.ge.u32 %p30, %r5, %r555;
@%p30 bra $L__BB0_32;
add.s32 %r329, %r555, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r555, 1;
setp.gt.u32 %p31, %r555, 3;
mov.u32 %r555, %r37;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
setp.lt.u32 %p33, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p33 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p8 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p8 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
// begin inline asm
{ cvt.f32.f16 %f338, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r354;
// begin inline asm
{ cvt.f32.f16 %f339, %rs98;}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r338;
// begin inline asm
{ cvt.f32.f16 %f340, %rs99;}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ cvt.f32.f16 %f342, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f343, %rs102;}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ cvt.f32.f16 %f344, %rs103;}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r334, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r347;
// begin inline asm
{ cvt.f32.f16 %f346, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r355;
// begin inline asm
{ cvt.f32.f16 %f347, %rs106;}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r339;
// begin inline asm
{ cvt.f32.f16 %f348, %rs107;}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ cvt.f32.f16 %f350, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f351, %rs110;}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ cvt.f32.f16 %f352, %rs111;}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r335, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r348;
// begin inline asm
{ cvt.f32.f16 %f354, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r356;
// begin inline asm
{ cvt.f32.f16 %f355, %rs114;}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r340;
// begin inline asm
{ cvt.f32.f16 %f356, %rs115;}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ cvt.f32.f16 %f358, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f359, %rs118;}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ cvt.f32.f16 %f360, %rs119;}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r336, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r349;
// begin inline asm
{ cvt.f32.f16 %f362, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r357;
// begin inline asm
{ cvt.f32.f16 %f363, %rs122;}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r341;
// begin inline asm
{ cvt.f32.f16 %f364, %rs123;}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ cvt.f32.f16 %f366, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f367, %rs126;}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ cvt.f32.f16 %f368, %rs127;}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r337, {%rs124, %rs128};
mad.lo.s32 %r362, %r23, %r212, %r8;
mul.wide.s32 %rd84, %r362, 2;
add.s64 %rd83, %rd38, %rd84;
// begin inline asm
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r553, %r553, 1;
setp.lt.s32 %p37, %r553, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r363, %tid.z;
mad.lo.s32 %r364, %r4, %r363, %r9;
mad.lo.s32 %r49, %r364, %r3, %r5;
mul.wide.u32 %rd85, %r49, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r365, %r4;
mov.u32 %r366, 31;
sub.s32 %r50, %r366, %r365;
mov.u32 %r367, 1;
shl.b32 %r587, %r367, %r50;
setp.lt.u32 %p38, %r9, %r587;
add.s32 %r368, %r587, %r9;
setp.lt.u32 %p39, %r368, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
shl.b32 %r369, %r3, %r50;
add.s32 %r370, %r49, %r369;
mul.wide.s32 %rd87, %r370, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p41, %r587, 4;
@%p41 bra $L__BB0_51;
mov.u32 %r557, %r587;
$L__BB0_48:
shr.u32 %r53, %r557, 1;
setp.ge.u32 %p42, %r9, %r53;
@%p42 bra $L__BB0_50;
mad.lo.s32 %r371, %r53, %r3, %r49;
mul.wide.s32 %rd90, %r371, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p43, %r557, 7;
mov.u32 %r557, %r53;
@%p43 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r558, 0;
add.s32 %r373, %r49, %r3;
mul.wide.u32 %rd93, %r373, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p45, %r4, 2;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r558, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
shl.b32 %r374, %r3, %r50;
add.s32 %r375, %r49, %r374;
mul.wide.s32 %rd95, %r375, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
mov.u32 %r559, %r587;
$L__BB0_59:
shr.u32 %r57, %r559, 1;
setp.ge.u32 %p48, %r9, %r57;
@%p48 bra $L__BB0_61;
mad.lo.s32 %r376, %r57, %r3, %r49;
mul.wide.s32 %rd98, %r376, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p49, %r559, 7;
mov.u32 %r559, %r57;
@%p49 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r560, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@%p51 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r560, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
shl.b32 %r378, %r3, %r50;
add.s32 %r379, %r49, %r378;
mul.wide.s32 %rd101, %r379, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
mov.u32 %r561, %r587;
$L__BB0_70:
shr.u32 %r61, %r561, 1;
setp.ge.u32 %p54, %r9, %r61;
@%p54 bra $L__BB0_72;
mad.lo.s32 %r380, %r61, %r3, %r49;
mul.wide.s32 %rd104, %r380, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p55, %r561, 7;
mov.u32 %r561, %r61;
@%p55 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r562, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r562, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
shl.b32 %r382, %r3, %r50;
add.s32 %r383, %r49, %r382;
mul.wide.s32 %rd107, %r383, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
mov.u32 %r563, %r587;
$L__BB0_81:
shr.u32 %r65, %r563, 1;
setp.ge.u32 %p60, %r9, %r65;
@%p60 bra $L__BB0_83;
mad.lo.s32 %r384, %r65, %r3, %r49;
mul.wide.s32 %rd110, %r384, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p61, %r563, 7;
mov.u32 %r563, %r65;
@%p61 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r564, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r564, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
shl.b32 %r386, %r3, %r50;
add.s32 %r387, %r49, %r386;
mul.wide.s32 %rd113, %r387, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
mov.u32 %r565, %r587;
$L__BB0_92:
shr.u32 %r69, %r565, 1;
setp.ge.u32 %p66, %r9, %r69;
@%p66 bra $L__BB0_94;
mad.lo.s32 %r388, %r69, %r3, %r49;
mul.wide.s32 %rd116, %r388, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p67, %r565, 7;
mov.u32 %r565, %r69;
@%p67 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r566, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r566, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
shl.b32 %r390, %r3, %r50;
add.s32 %r391, %r49, %r390;
mul.wide.s32 %rd119, %r391, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
mov.u32 %r567, %r587;
$L__BB0_103:
shr.u32 %r73, %r567, 1;
setp.ge.u32 %p72, %r9, %r73;
@%p72 bra $L__BB0_105;
mad.lo.s32 %r392, %r73, %r3, %r49;
mul.wide.s32 %rd122, %r392, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p73, %r567, 7;
mov.u32 %r567, %r73;
@%p73 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r568, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r568, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
shl.b32 %r394, %r3, %r50;
add.s32 %r395, %r49, %r394;
mul.wide.s32 %rd125, %r395, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
mov.u32 %r569, %r587;
$L__BB0_114:
shr.u32 %r77, %r569, 1;
setp.ge.u32 %p78, %r9, %r77;
@%p78 bra $L__BB0_116;
mad.lo.s32 %r396, %r77, %r3, %r49;
mul.wide.s32 %rd128, %r396, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p79, %r569, 7;
mov.u32 %r569, %r77;
@%p79 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r570, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r570, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
shl.b32 %r398, %r3, %r50;
add.s32 %r399, %r49, %r398;
mul.wide.s32 %rd131, %r399, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
mov.u32 %r571, %r587;
$L__BB0_125:
shr.u32 %r81, %r571, 1;
setp.ge.u32 %p84, %r9, %r81;
@%p84 bra $L__BB0_127;
mad.lo.s32 %r400, %r81, %r3, %r49;
mul.wide.s32 %rd134, %r400, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p85, %r571, 7;
mov.u32 %r571, %r81;
@%p85 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r572, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r572, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r556, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
shl.b32 %r402, %r3, %r50;
add.s32 %r403, %r49, %r402;
mul.wide.s32 %rd137, %r403, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
mov.u32 %r573, %r587;
$L__BB0_136:
shr.u32 %r86, %r573, 1;
setp.ge.u32 %p90, %r9, %r86;
@%p90 bra $L__BB0_138;
mad.lo.s32 %r404, %r86, %r3, %r49;
mul.wide.s32 %rd140, %r404, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p91, %r573, 7;
mov.u32 %r573, %r86;
@%p91 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r574, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r574, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
shl.b32 %r406, %r3, %r50;
add.s32 %r407, %r49, %r406;
mul.wide.s32 %rd143, %r407, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
mov.u32 %r575, %r587;
$L__BB0_147:
shr.u32 %r90, %r575, 1;
setp.ge.u32 %p96, %r9, %r90;
@%p96 bra $L__BB0_149;
mad.lo.s32 %r408, %r90, %r3, %r49;
mul.wide.s32 %rd146, %r408, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p97, %r575, 7;
mov.u32 %r575, %r90;
@%p97 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r576, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r576, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
shl.b32 %r410, %r3, %r50;
add.s32 %r411, %r49, %r410;
mul.wide.s32 %rd149, %r411, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
mov.u32 %r577, %r587;
$L__BB0_158:
shr.u32 %r94, %r577, 1;
setp.ge.u32 %p102, %r9, %r94;
@%p102 bra $L__BB0_160;
mad.lo.s32 %r412, %r94, %r3, %r49;
mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p103, %r577, 7;
mov.u32 %r577, %r94;
@%p103 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r578, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r578, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
shl.b32 %r414, %r3, %r50;
add.s32 %r415, %r49, %r414;
mul.wide.s32 %rd155, %r415, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
mov.u32 %r579, %r587;
$L__BB0_169:
shr.u32 %r98, %r579, 1;
setp.ge.u32 %p108, %r9, %r98;
@%p108 bra $L__BB0_171;
mad.lo.s32 %r416, %r98, %r3, %r49;
mul.wide.s32 %rd158, %r416, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p109, %r579, 7;
mov.u32 %r579, %r98;
@%p109 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r580, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r580, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
shl.b32 %r418, %r3, %r50;
add.s32 %r419, %r49, %r418;
mul.wide.s32 %rd161, %r419, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
mov.u32 %r581, %r587;
$L__BB0_180:
shr.u32 %r102, %r581, 1;
setp.ge.u32 %p114, %r9, %r102;
@%p114 bra $L__BB0_182;
mad.lo.s32 %r420, %r102, %r3, %r49;
mul.wide.s32 %rd164, %r420, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p115, %r581, 7;
mov.u32 %r581, %r102;
@%p115 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r582, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r582, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
shl.b32 %r422, %r3, %r50;
add.s32 %r423, %r49, %r422;
mul.wide.s32 %rd167, %r423, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
mov.u32 %r583, %r587;
$L__BB0_191:
shr.u32 %r106, %r583, 1;
setp.ge.u32 %p120, %r9, %r106;
@%p120 bra $L__BB0_193;
mad.lo.s32 %r424, %r106, %r3, %r49;
mul.wide.s32 %rd170, %r424, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p121, %r583, 7;
mov.u32 %r583, %r106;
@%p121 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r584, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r584, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
shl.b32 %r426, %r3, %r50;
add.s32 %r427, %r49, %r426;
mul.wide.s32 %rd173, %r427, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
mov.u32 %r585, %r587;
$L__BB0_202:
shr.u32 %r110, %r585, 1;
setp.ge.u32 %p126, %r9, %r110;
@%p126 bra $L__BB0_204;
mad.lo.s32 %r428, %r110, %r3, %r49;
mul.wide.s32 %rd176, %r428, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p127, %r585, 7;
mov.u32 %r585, %r110;
@%p127 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r586, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r586, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
shl.b32 %r430, %r3, %r50;
add.s32 %r431, %r49, %r430;
mul.wide.s32 %rd179, %r431, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r587, 1;
setp.ge.u32 %p132, %r9, %r114;
@%p132 bra $L__BB0_214;
mad.lo.s32 %r432, %r114, %r3, %r49;
mul.wide.s32 %rd182, %r432, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p133, %r587, 7;
mov.u32 %r587, %r114;
@%p133 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r588, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r588, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
mov.u32 %r458, %ctaid.y;
mad.lo.s32 %r459, %r212, %r458, %r8;
add.s32 %r460, %r459, %r84;
mul.wide.s32 %rd191, %r460, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r558,%r560,%r562,%r564};
// end inline asm
add.s32 %r461, %r460, 4;
mul.wide.s32 %rd192, %r461, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r566,%r568,%r570,%r572};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
add.s32 %r434, %r8, 3;
sub.s32 %r117, %r434, %r212;
mov.u32 %r435, %ctaid.y;
mad.lo.s32 %r118, %r212, %r435, %r8;
neg.s32 %r436, %r84;
setp.ge.s32 %p139, %r117, %r436;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
add.s32 %r441, %r118, %r84;
mul.wide.s32 %rd186, %r441, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r558,%r560,%r562,%r564};
// end inline asm
$L__BB0_222:
mov.u32 %r442, -4;
sub.s32 %r443, %r442, %r84;
setp.ge.s32 %p141, %r117, %r443;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
add.s32 %r448, %r118, %r84;
add.s32 %r449, %r448, 4;
mul.wide.s32 %rd188, %r449, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r566,%r568,%r570,%r572};
// end inline asm
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
shl.b32 %r486, %r556, 5;
mov.u32 %r487, %ctaid.y;
mad.lo.s32 %r488, %r212, %r487, %r8;
add.s32 %r489, %r488, %r486;
mul.wide.s32 %rd199, %r489, 4;
add.s64 %rd197, %rd42, %rd199;
// begin inline asm
st.volatile.global.v4.s32 [%rd197], {%r574,%r576,%r578,%r580};
// end inline asm
add.s32 %r490, %r489, 4;
mul.wide.s32 %rd200, %r490, 4;
add.s64 %rd198, %rd42, %rd200;
// begin inline asm
st.volatile.global.v4.s32 [%rd198], {%r582,%r584,%r586,%r588};
// end inline asm
bra.uni $L__BB0_235;
$L__BB0_227:
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
add.s32 %r462, %r8, 3;
sub.s32 %r119, %r462, %r212;
mov.u32 %r463, %ctaid.y;
mad.lo.s32 %r120, %r212, %r463, %r8;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
shl.b32 %r121, %r556, 5;
neg.s32 %r464, %r121;
setp.ge.s32 %p148, %r119, %r464;
@%p148 bra $L__BB0_230;
add.s32 %r469, %r120, %r121;
mul.wide.s32 %rd194, %r469, 4;
add.s64 %rd193, %rd42, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r574,%r576,%r578,%r580};
// end inline asm
$L__BB0_230:
@%p147 bra $L__BB0_235;
shl.b32 %r122, %r556, 5;
mov.u32 %r470, -4;
sub.s32 %r471, %r470, %r122;
setp.ge.s32 %p150, %r119, %r471;
@%p150 bra $L__BB0_235;
add.s32 %r476, %r120, %r122;
add.s32 %r477, %r476, 4;
mul.wide.s32 %rd196, %r477, 4;
add.s64 %rd195, %rd42, %rd196;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r582,%r584,%r586,%r588};
// end inline asm
$L__BB0_235:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r491, %r5, %r9;
or.b32 %r493, %r491, %r363;
setp.ne.s32 %p152, %r493, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
mov.u32 %r494, %ctaid.x;
mov.u32 %r495, %ctaid.z;
mov.u32 %r496, %nctaid.x;
mad.lo.s32 %r497, %r495, %r496, %r494;
mul.wide.s32 %rd202, %r497, 8;
add.s64 %rd27, %rd201, %rd202;
add.s32 %r498, %r11, -1;
setp.eq.s32 %p153, %r123, %r498;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
mov.u32 %r589, 8;
$L__BB0_238:
// begin inline asm
nanosleep.u32 %r589;
// end inline asm
setp.lt.u32 %p155, %r589, 256;
selp.u32 %r501, 1, 0, %p155;
shl.b32 %r589, %r589, %r501;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_30_cu_83dc6796_723310nvfuser_30ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
mov.u32 %r503, 1;
add.s32 %r504, %r212, 1;
shr.u32 %r505, %r504, 31;
add.s32 %r506, %r504, %r505;
shr.s32 %r507, %r506, 1;
add.s32 %r508, %r4, %r507;
add.s32 %r509, %r508, -1;
div.s32 %r510, %r509, %r4;
add.s32 %r511, %r11, -1;
add.s32 %r512, %r511, %r510;
div.s32 %r126, %r512, %r11;
add.s32 %r127, %r511, %r3;
shl.b32 %r128, %r9, 1;
shl.b32 %r513, %r4, 1;
mad.lo.s32 %r131, %r513, %r123, %r128;
or.b32 %r129, %r131, 1;
mul.lo.s32 %r130, %r513, %r11;
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
shl.b32 %r132, %r503, %r516;
setp.lt.u32 %p157, %r5, %r132;
add.s32 %r517, %r132, %r5;
setp.lt.u32 %p158, %r517, %r3;
and.pred %p7, %p157, %p158;
add.s32 %r518, %r49, %r132;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd45, %rd211;
shr.u32 %r519, %r132, 31;
add.s32 %r520, %r132, %r519;
shr.s32 %r133, %r520, 1;
add.s32 %r521, %r49, 1;
mul.wide.u32 %rd213, %r521, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
mov.u32 %r590, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
add.s32 %r590, %r590, 1;
$L__BB0_240:
.pragma "nounroll";
setp.lt.s32 %p159, %r590, %r126;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
div.s32 %r159, %r127, %r3;
setp.lt.s32 %p180, %r159, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
mul.lo.s32 %r536, %r130, %r590;
add.s32 %r160, %r129, %r536;
add.s32 %r161, %r131, %r536;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r599, %r535;
$L__BB0_276:
.pragma "nounroll";
setp.ge.s32 %p181, %r160, %r212;
mov.u32 %r600, %r535;
mov.u32 %r601, %r535;
@%p181 bra $L__BB0_279;
mad.lo.s32 %r163, %r599, %r3, %r5;
setp.ge.s32 %p182, %r163, %r11;
mov.u32 %r600, %r535;
mov.u32 %r601, %r535;
@%p182 bra $L__BB0_279;
mad.lo.s32 %r543, %r163, %r212, %r161;
mul.wide.s32 %rd225, %r543, 4;
add.s64 %rd224, %rd41, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r601,%r600}, [%rd224];
// end inline asm
$L__BB0_279:
mov.b32 %f584, %r601;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r600;
add.f32 %f684, %f684, %f585;
add.s32 %r599, %r599, 1;
setp.lt.s32 %p183, %r599, %r159;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p184 bra $L__BB0_282;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
setp.lt.s32 %p185, %r132, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
mov.u32 %r602, %r133;
$L__BB0_284:
setp.ge.u32 %p186, %r5, %r602;
@%p186 bra $L__BB0_286;
add.s32 %r544, %r602, %r49;
mul.wide.s32 %rd226, %r544, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
shr.u32 %r170, %r602, 1;
setp.gt.u32 %p187, %r602, 3;
mov.u32 %r602, %r170;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
setp.lt.u32 %p189, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p189 bra $L__BB0_290;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_290:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p184 bra $L__BB0_292;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
mov.u32 %r603, %r133;
$L__BB0_294:
setp.ge.u32 %p192, %r5, %r603;
@%p192 bra $L__BB0_296;
add.s32 %r545, %r603, %r49;
mul.wide.s32 %rd229, %r545, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
shr.u32 %r172, %r603, 1;
setp.gt.u32 %p193, %r603, 3;
mov.u32 %r603, %r172;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
setp.lt.u32 %p195, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p195 bra $L__BB0_300;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_300:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f687;}
// end inline asm
@%p8 bra $L__BB0_303;
mul.lo.s32 %r173, %r130, %r590;
add.s32 %r546, %r129, %r173;
setp.ge.s32 %p197, %r546, %r212;
@%p197 bra $L__BB0_303;
add.s32 %r547, %r131, %r173;
mul.wide.s32 %rd232, %r547, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
setp.lt.s32 %p160, %r126, 1;
@%p160 bra $L__BB0_273;
div.s32 %r135, %r127, %r3;
mad.lo.s32 %r136, %r212, %r5, %r128;
shl.b32 %r137, %r123, 1;
shl.b32 %r138, %r11, 1;
mul.lo.s32 %r139, %r212, %r3;
mov.u32 %r591, 0;
$L__BB0_243:
.pragma "nounroll";
setp.lt.s32 %p161, %r135, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
mad.lo.s32 %r141, %r130, %r591, %r129;
mad.lo.s32 %r524, %r138, %r591, %r137;
mad.lo.s32 %r593, %r4, %r524, %r136;
mov.u32 %r523, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r592, %r5;
mov.u32 %r594, %r523;
$L__BB0_245:
.pragma "nounroll";
setp.ge.s32 %p162, %r141, %r212;
mov.u32 %r595, %r523;
mov.u32 %r596, %r523;
@%p162 bra $L__BB0_248;
setp.ge.s32 %p163, %r592, %r11;
mov.u32 %r595, %r523;
mov.u32 %r596, %r523;
@%p163 bra $L__BB0_248;
mul.wide.s32 %rd215, %r593, 4;
add.s64 %rd214, %rd42, %rd215;
// begin inline asm
ld.volatile.global.v2.s32 {%r596,%r595}, [%rd214];
// end inline asm
$L__BB0_248:
mov.b32 %f558, %r596;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r595;
add.f32 %f678, %f678, %f559;
add.s32 %r593, %r593, %r139;
add.s32 %r592, %r592, %r3;
add.s32 %r594, %r594, 1;
setp.lt.s32 %p164, %r594, %r135;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@%p184 bra $L__BB0_251;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
setp.lt.s32 %p166, %r132, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
mov.u32 %r597, %r133;
$L__BB0_253:
setp.ge.u32 %p167, %r5, %r597;
@%p167 bra $L__BB0_255;
add.s32 %r531, %r597, %r49;
mul.wide.s32 %rd216, %r531, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
shr.u32 %r154, %r597, 1;
setp.gt.u32 %p168, %r597, 3;
mov.u32 %r597, %r154;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
setp.lt.u32 %p170, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p170 bra $L__BB0_259;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_259:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p184 bra $L__BB0_261;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
mov.u32 %r598, %r133;
$L__BB0_263:
setp.ge.u32 %p173, %r5, %r598;
@%p173 bra $L__BB0_265;
add.s32 %r532, %r598, %r49;
mul.wide.s32 %rd219, %r532, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
shr.u32 %r156, %r598, 1;
setp.gt.u32 %p174, %r598, 3;
mov.u32 %r598, %r156;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
setp.lt.u32 %p176, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p176 bra $L__BB0_269;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_269:
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f681;}
// end inline asm
@%p8 bra $L__BB0_272;
mul.lo.s32 %r157, %r130, %r591;
add.s32 %r533, %r129, %r157;
setp.ge.s32 %p178, %r533, %r212;
@%p178 bra $L__BB0_272;
add.s32 %r534, %r131, %r157;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
add.s32 %r591, %r591, 1;
setp.lt.s32 %p179, %r591, %r126;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<610>;
+ .reg .b32 %r<604>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r222, %r223}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r226, %r227}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r221, %r222}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r225, %r226}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r248, %r213, 7;
- shr.s32 %r249, %r248, 31;
- shr.u32 %r250, %r249, 29;
- add.s32 %r251, %r248, %r250;
- shr.s32 %r2, %r251, 3;
+ add.s32 %r247, %r212, 7;
+ shr.s32 %r248, %r247, 31;
+ shr.u32 %r249, %r248, 29;
+ add.s32 %r250, %r247, %r249;
+ shr.s32 %r2, %r250, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
- mov.u32 %r252, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r252;
+ mov.u32 %r251, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r251;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r253, [%rd44], %r5;
+ atom.shared.min.s32 %r252, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r254, %r4, %r2;
- shl.b32 %r255, %r254, 4;
- or.b32 %r256, %r255, 15;
- and.b32 %r7, %r256, -16;
- add.s32 %r257, %r256, %r7;
- and.b32 %r258, %r257, -16;
- cvt.s64.s32 %rd1, %r258;
- shl.b32 %r259, %r4, 2;
- max.s32 %r260, %r2, %r3;
- mad.lo.s32 %r261, %r259, %r260, 15;
- and.b32 %r262, %r261, -16;
- cvt.u64.u32 %rd2, %r262;
+ mul.lo.s32 %r253, %r4, %r2;
+ shl.b32 %r254, %r253, 4;
+ or.b32 %r255, %r254, 15;
+ and.b32 %r7, %r255, -16;
+ add.s32 %r256, %r255, %r7;
+ and.b32 %r257, %r256, -16;
+ cvt.s64.s32 %rd1, %r257;
+ shl.b32 %r258, %r4, 2;
+ max.s32 %r259, %r2, %r3;
+ mad.lo.s32 %r260, %r258, %r259, 15;
+ and.b32 %r261, %r260, -16;
+ cvt.u64.u32 %rd2, %r261;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r263, %r8, 7;
- setp.lt.s32 %p9, %r263, %r213;
+ or.b32 %r262, %r8, 7;
+ setp.lt.s32 %p9, %r262, %r212;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r264, smem_ptr; }
-
-
- shl.b32 %r267, %r5, 4;
- add.s32 %r265, %r264, %r267;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r263, smem_ptr; }
+
+
+ shl.b32 %r266, %r5, 4;
+ add.s32 %r264, %r263, %r266;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
- mov.u32 %r266, 0;
+ mov.u32 %r265, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r266, 0;
- cp.async.ca.shared.global [%r265], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r265, 0;
+ cp.async.ca.shared.global [%r264], [%rd48], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r562, %r6, 4;
- add.s32 %r268, %r4, 215;
- div.s32 %r269, %r268, %r4;
+ shl.b32 %r556, %r6, 4;
+ add.s32 %r267, %r4, 215;
+ div.s32 %r268, %r267, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r270, %r11, %r269;
- add.s32 %r271, %r270, -1;
- div.s32 %r12, %r271, %r11;
+ add.s32 %r269, %r11, %r268;
+ add.s32 %r270, %r269, -1;
+ div.s32 %r12, %r270, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r213;
+ cvt.rn.f64.s32 %fd1, %r212;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
- mov.u32 %r273, %ctaid.y;
- mul.lo.s32 %r274, %r12, %r4;
- mul.lo.s32 %r13, %r274, %r273;
- shl.b32 %r275, %r9, 1;
- mov.u32 %r276, 1;
- shl.b32 %r277, %r5, 4;
- mad.lo.s32 %r14, %r275, %r213, %r277;
- mul.lo.s32 %r278, %r213, %r9;
- cvt.s64.s32 %rd54, %r278;
+ mov.u32 %r272, %ctaid.y;
+ mul.lo.s32 %r273, %r12, %r4;
+ mul.lo.s32 %r13, %r273, %r272;
+ mad.lo.s32 %r274, %r2, %r9, %r5;
+ shl.b32 %r14, %r274, 4;
+ mul.lo.s32 %r275, %r212, %r9;
+ cvt.s64.s32 %rd54, %r275;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r279, %r13, %r213;
- cvt.s64.s32 %rd6, %r279;
+ mul.lo.s32 %r276, %r13, %r212;
+ cvt.s64.s32 %rd6, %r276;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r280, %tid.z;
- mad.lo.s32 %r281, %r4, %r280, %r9;
- mad.lo.s32 %r15, %r281, %r3, %r5;
+ mov.u32 %r277, %tid.z;
+ mad.lo.s32 %r278, %r4, %r277, %r9;
+ mad.lo.s32 %r15, %r278, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
- clz.b32 %r282, %r3;
- mov.u32 %r283, 31;
- sub.s32 %r284, %r283, %r282;
- shl.b32 %r16, %r276, %r284;
+ clz.b32 %r279, %r3;
+ mov.u32 %r280, 31;
+ sub.s32 %r281, %r280, %r279;
+ mov.u32 %r282, 1;
+ shl.b32 %r16, %r282, %r281;
setp.lt.u32 %p15, %r5, %r16;
- add.s32 %r285, %r16, %r5;
- setp.lt.u32 %p16, %r285, %r3;
+ add.s32 %r283, %r16, %r5;
+ setp.lt.u32 %p16, %r283, %r3;
and.pred %p2, %p15, %p16;
- add.s32 %r286, %r15, %r16;
- mul.wide.s32 %rd57, %r286, 4;
+ add.s32 %r284, %r15, %r16;
+ mul.wide.s32 %rd57, %r284, 4;
add.s64 %rd8, %rd45, %rd57;
- shr.u32 %r287, %r16, 31;
- add.s32 %r288, %r16, %r287;
- shr.s32 %r17, %r288, 1;
- add.s32 %r18, %r278, %r8;
+ shr.u32 %r285, %r16, 31;
+ add.s32 %r286, %r16, %r285;
+ shr.s32 %r17, %r286, 1;
+ shl.b32 %r287, %r9, 3;
+ mad.lo.s32 %r288, %r287, %r2, %r8;
add.s64 %rd58, %rd45, %rd51;
- mul.wide.s32 %rd59, %r18, 2;
+ mul.wide.s32 %rd59, %r288, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
- mul.wide.s32 %rd63, %r281, 4;
+ mul.wide.s32 %rd63, %r278, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
- mov.u32 %r559, 0;
+ mov.u32 %r553, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
- add.s32 %r293, %r14, %r292;
+ add.s32 %r293, %r292, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
- add.s32 %r296, %r14, %r295;
+ add.s32 %r296, %r295, %r14;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r559, %r4;
- add.s32 %r290, %r23, %r9;
- add.s32 %r24, %r290, %r13;
- setp.gt.s32 %p17, %r24, 215;
+ mul.lo.s32 %r22, %r553, %r4;
+ add.s32 %r290, %r22, %r9;
+ add.s32 %r23, %r290, %r13;
+ setp.gt.s32 %p17, %r23, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
- mul.lo.s32 %r291, %r24, %r222;
+ mul.lo.s32 %r291, %r23, %r221;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
- setp.lt.s32 %p18, %r24, 216;
+ setp.lt.s32 %p18, %r23, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
- mul.lo.s32 %r557, %r559, %r4;
- mul.lo.s32 %r298, %r557, %r213;
+ mul.lo.s32 %r551, %r553, %r4;
+ mul.lo.s32 %r298, %r551, %r212;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r555, %r559, %r4;
- add.s32 %r554, %r555, %r9;
- add.s32 %r553, %r554, %r13;
- setp.gt.s32 %p199, %r553, 215;
+ mul.lo.s32 %r550, %r553, %r4;
+ add.s32 %r549, %r550, %r9;
+ add.s32 %r548, %r549, %r13;
+ setp.gt.s32 %p199, %r548, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
- mul.lo.s32 %r299, %r24, %r226;
+ mul.lo.s32 %r299, %r23, %r225;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r562, %r562, 2;
+ shl.b32 %r556, %r556, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
- mov.u32 %r560, %r17;
+ mov.u32 %r554, %r17;
$L__BB0_20:
- setp.ge.u32 %p24, %r5, %r560;
+ setp.ge.u32 %p24, %r5, %r554;
@%p24 bra $L__BB0_22;
- add.s32 %r328, %r560, %r15;
+ add.s32 %r328, %r554, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r560, 1;
- setp.gt.u32 %p25, %r560, 3;
- mov.u32 %r560, %r36;
+ shr.u32 %r35, %r554, 1;
+ setp.gt.u32 %p25, %r554, 3;
+ mov.u32 %r554, %r35;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
- mov.u32 %r561, %r17;
+ mov.u32 %r555, %r17;
$L__BB0_30:
- setp.ge.u32 %p30, %r5, %r561;
+ setp.ge.u32 %p30, %r5, %r555;
@%p30 bra $L__BB0_32;
- add.s32 %r329, %r561, %r15;
+ add.s32 %r329, %r555, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r561, 1;
- setp.gt.u32 %p31, %r561, 3;
- mov.u32 %r561, %r38;
+ shr.u32 %r37, %r555, 1;
+ setp.gt.u32 %p31, %r555, 3;
+ mov.u32 %r555, %r37;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r556, %r559, %r4;
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
@@ -801,13 +800,12 @@
{ cvt.rn.f16.f32 %rs124, %f365;}
mov.b32 %r337, {%rs124, %rs128};
- add.s32 %r362, %r13, %r556;
- mad.lo.s32 %r363, %r362, %r213, %r18;
- mul.wide.s32 %rd84, %r363, 2;
+ mad.lo.s32 %r362, %r23, %r212, %r8;
+ mul.wide.s32 %rd84, %r362, 2;
add.s64 %rd83, %rd38, %rd84;
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.f16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r559, %r559, 1;
- setp.lt.s32 %p37, %r559, %r12;
+ add.s32 %r553, %r553, 1;
+ setp.lt.s32 %p37, %r553, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r364, %tid.z;
- mad.lo.s32 %r365, %r4, %r364, %r9;
- mad.lo.s32 %r50, %r365, %r3, %r5;
- mul.wide.u32 %rd85, %r50, 4;
+ mov.u32 %r363, %tid.z;
+ mad.lo.s32 %r364, %r4, %r363, %r9;
+ mad.lo.s32 %r49, %r364, %r3, %r5;
+ mul.wide.u32 %rd85, %r49, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r366, %r4;
- mov.u32 %r367, 31;
- sub.s32 %r51, %r367, %r366;
- mov.u32 %r368, 1;
- shl.b32 %r593, %r368, %r51;
- setp.lt.u32 %p38, %r9, %r593;
- add.s32 %r369, %r593, %r9;
- setp.lt.u32 %p39, %r369, %r4;
+ clz.b32 %r365, %r4;
+ mov.u32 %r366, 31;
+ sub.s32 %r50, %r366, %r365;
+ mov.u32 %r367, 1;
+ shl.b32 %r587, %r367, %r50;
+ setp.lt.u32 %p38, %r9, %r587;
+ add.s32 %r368, %r587, %r9;
+ setp.lt.u32 %p39, %r368, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
- shl.b32 %r370, %r3, %r51;
- add.s32 %r371, %r50, %r370;
- mul.wide.s32 %rd87, %r371, 4;
+ shl.b32 %r369, %r3, %r50;
+ add.s32 %r370, %r49, %r369;
+ mul.wide.s32 %rd87, %r370, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p41, %r593, 4;
+ setp.lt.s32 %p41, %r587, 4;
@%p41 bra $L__BB0_51;
- mov.u32 %r563, %r593;
+ mov.u32 %r557, %r587;
$L__BB0_48:
- shr.u32 %r54, %r563, 1;
- setp.ge.u32 %p42, %r9, %r54;
+ shr.u32 %r53, %r557, 1;
+ setp.ge.u32 %p42, %r9, %r53;
@%p42 bra $L__BB0_50;
- mad.lo.s32 %r372, %r54, %r3, %r50;
- mul.wide.s32 %rd90, %r372, 4;
+ mad.lo.s32 %r371, %r53, %r3, %r49;
+ mul.wide.s32 %rd90, %r371, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p43, %r563, 7;
- mov.u32 %r563, %r54;
+ setp.gt.u32 %p43, %r557, 7;
+ mov.u32 %r557, %r53;
@%p43 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r564, 0;
- add.s32 %r374, %r50, %r3;
- mul.wide.u32 %rd93, %r374, 4;
+ mov.u32 %r558, 0;
+ add.s32 %r373, %r49, %r3;
+ mul.wide.u32 %rd93, %r373, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r564, %f660;
+ mov.b32 %r558, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
- shl.b32 %r375, %r3, %r51;
- add.s32 %r376, %r50, %r375;
- mul.wide.s32 %rd95, %r376, 4;
+ shl.b32 %r374, %r3, %r50;
+ add.s32 %r375, %r49, %r374;
+ mul.wide.s32 %rd95, %r375, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
- mov.u32 %r565, %r593;
+ mov.u32 %r559, %r587;
$L__BB0_59:
- shr.u32 %r58, %r565, 1;
- setp.ge.u32 %p48, %r9, %r58;
+ shr.u32 %r57, %r559, 1;
+ setp.ge.u32 %p48, %r9, %r57;
@%p48 bra $L__BB0_61;
- mad.lo.s32 %r377, %r58, %r3, %r50;
- mul.wide.s32 %rd98, %r377, 4;
+ mad.lo.s32 %r376, %r57, %r3, %r49;
+ mul.wide.s32 %rd98, %r376, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p49, %r565, 7;
- mov.u32 %r565, %r58;
+ setp.gt.u32 %p49, %r559, 7;
+ mov.u32 %r559, %r57;
@%p49 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r566, 0;
+ mov.u32 %r560, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r566, %f661;
+ mov.b32 %r560, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
- shl.b32 %r379, %r3, %r51;
- add.s32 %r380, %r50, %r379;
- mul.wide.s32 %rd101, %r380, 4;
+ shl.b32 %r378, %r3, %r50;
+ add.s32 %r379, %r49, %r378;
+ mul.wide.s32 %rd101, %r379, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
- mov.u32 %r567, %r593;
+ mov.u32 %r561, %r587;
$L__BB0_70:
- shr.u32 %r62, %r567, 1;
- setp.ge.u32 %p54, %r9, %r62;
+ shr.u32 %r61, %r561, 1;
+ setp.ge.u32 %p54, %r9, %r61;
@%p54 bra $L__BB0_72;
- mad.lo.s32 %r381, %r62, %r3, %r50;
- mul.wide.s32 %rd104, %r381, 4;
+ mad.lo.s32 %r380, %r61, %r3, %r49;
+ mul.wide.s32 %rd104, %r380, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p55, %r567, 7;
- mov.u32 %r567, %r62;
+ setp.gt.u32 %p55, %r561, 7;
+ mov.u32 %r561, %r61;
@%p55 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r568, 0;
+ mov.u32 %r562, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r568, %f662;
+ mov.b32 %r562, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
- shl.b32 %r383, %r3, %r51;
- add.s32 %r384, %r50, %r383;
- mul.wide.s32 %rd107, %r384, 4;
+ shl.b32 %r382, %r3, %r50;
+ add.s32 %r383, %r49, %r382;
+ mul.wide.s32 %rd107, %r383, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
- mov.u32 %r569, %r593;
+ mov.u32 %r563, %r587;
$L__BB0_81:
- shr.u32 %r66, %r569, 1;
- setp.ge.u32 %p60, %r9, %r66;
+ shr.u32 %r65, %r563, 1;
+ setp.ge.u32 %p60, %r9, %r65;
@%p60 bra $L__BB0_83;
- mad.lo.s32 %r385, %r66, %r3, %r50;
- mul.wide.s32 %rd110, %r385, 4;
+ mad.lo.s32 %r384, %r65, %r3, %r49;
+ mul.wide.s32 %rd110, %r384, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p61, %r569, 7;
- mov.u32 %r569, %r66;
+ setp.gt.u32 %p61, %r563, 7;
+ mov.u32 %r563, %r65;
@%p61 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r570, 0;
+ mov.u32 %r564, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r570, %f663;
+ mov.b32 %r564, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
- shl.b32 %r387, %r3, %r51;
- add.s32 %r388, %r50, %r387;
- mul.wide.s32 %rd113, %r388, 4;
+ shl.b32 %r386, %r3, %r50;
+ add.s32 %r387, %r49, %r386;
+ mul.wide.s32 %rd113, %r387, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
- mov.u32 %r571, %r593;
+ mov.u32 %r565, %r587;
$L__BB0_92:
- shr.u32 %r70, %r571, 1;
- setp.ge.u32 %p66, %r9, %r70;
+ shr.u32 %r69, %r565, 1;
+ setp.ge.u32 %p66, %r9, %r69;
@%p66 bra $L__BB0_94;
- mad.lo.s32 %r389, %r70, %r3, %r50;
- mul.wide.s32 %rd116, %r389, 4;
+ mad.lo.s32 %r388, %r69, %r3, %r49;
+ mul.wide.s32 %rd116, %r388, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p67, %r571, 7;
- mov.u32 %r571, %r70;
+ setp.gt.u32 %p67, %r565, 7;
+ mov.u32 %r565, %r69;
@%p67 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r572, 0;
+ mov.u32 %r566, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r572, %f664;
+ mov.b32 %r566, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
- shl.b32 %r391, %r3, %r51;
- add.s32 %r392, %r50, %r391;
- mul.wide.s32 %rd119, %r392, 4;
+ shl.b32 %r390, %r3, %r50;
+ add.s32 %r391, %r49, %r390;
+ mul.wide.s32 %rd119, %r391, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
- mov.u32 %r573, %r593;
+ mov.u32 %r567, %r587;
$L__BB0_103:
- shr.u32 %r74, %r573, 1;
- setp.ge.u32 %p72, %r9, %r74;
+ shr.u32 %r73, %r567, 1;
+ setp.ge.u32 %p72, %r9, %r73;
@%p72 bra $L__BB0_105;
- mad.lo.s32 %r393, %r74, %r3, %r50;
- mul.wide.s32 %rd122, %r393, 4;
+ mad.lo.s32 %r392, %r73, %r3, %r49;
+ mul.wide.s32 %rd122, %r392, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p73, %r573, 7;
- mov.u32 %r573, %r74;
+ setp.gt.u32 %p73, %r567, 7;
+ mov.u32 %r567, %r73;
@%p73 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r574, 0;
+ mov.u32 %r568, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r574, %f665;
+ mov.b32 %r568, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
- shl.b32 %r395, %r3, %r51;
- add.s32 %r396, %r50, %r395;
- mul.wide.s32 %rd125, %r396, 4;
+ shl.b32 %r394, %r3, %r50;
+ add.s32 %r395, %r49, %r394;
+ mul.wide.s32 %rd125, %r395, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
- mov.u32 %r575, %r593;
+ mov.u32 %r569, %r587;
$L__BB0_114:
- shr.u32 %r78, %r575, 1;
- setp.ge.u32 %p78, %r9, %r78;
+ shr.u32 %r77, %r569, 1;
+ setp.ge.u32 %p78, %r9, %r77;
@%p78 bra $L__BB0_116;
- mad.lo.s32 %r397, %r78, %r3, %r50;
- mul.wide.s32 %rd128, %r397, 4;
+ mad.lo.s32 %r396, %r77, %r3, %r49;
+ mul.wide.s32 %rd128, %r396, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p79, %r575, 7;
- mov.u32 %r575, %r78;
+ setp.gt.u32 %p79, %r569, 7;
+ mov.u32 %r569, %r77;
@%p79 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r576, 0;
+ mov.u32 %r570, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r576, %f666;
+ mov.b32 %r570, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
- shl.b32 %r399, %r3, %r51;
- add.s32 %r400, %r50, %r399;
- mul.wide.s32 %rd131, %r400, 4;
+ shl.b32 %r398, %r3, %r50;
+ add.s32 %r399, %r49, %r398;
+ mul.wide.s32 %rd131, %r399, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
- mov.u32 %r577, %r593;
+ mov.u32 %r571, %r587;
$L__BB0_125:
- shr.u32 %r82, %r577, 1;
- setp.ge.u32 %p84, %r9, %r82;
+ shr.u32 %r81, %r571, 1;
+ setp.ge.u32 %p84, %r9, %r81;
@%p84 bra $L__BB0_127;
- mad.lo.s32 %r401, %r82, %r3, %r50;
- mul.wide.s32 %rd134, %r401, 4;
+ mad.lo.s32 %r400, %r81, %r3, %r49;
+ mul.wide.s32 %rd134, %r400, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p85, %r577, 7;
- mov.u32 %r577, %r82;
+ setp.gt.u32 %p85, %r571, 7;
+ mov.u32 %r571, %r81;
@%p85 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r578, 0;
+ mov.u32 %r572, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r578, %f667;
+ mov.b32 %r572, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r562, 4;
+ shl.b32 %r84, %r556, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
- shl.b32 %r403, %r3, %r51;
- add.s32 %r404, %r50, %r403;
- mul.wide.s32 %rd137, %r404, 4;
+ shl.b32 %r402, %r3, %r50;
+ add.s32 %r403, %r49, %r402;
+ mul.wide.s32 %rd137, %r403, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
- mov.u32 %r579, %r593;
+ mov.u32 %r573, %r587;
$L__BB0_136:
- shr.u32 %r87, %r579, 1;
- setp.ge.u32 %p90, %r9, %r87;
+ shr.u32 %r86, %r573, 1;
+ setp.ge.u32 %p90, %r9, %r86;
@%p90 bra $L__BB0_138;
- mad.lo.s32 %r405, %r87, %r3, %r50;
- mul.wide.s32 %rd140, %r405, 4;
+ mad.lo.s32 %r404, %r86, %r3, %r49;
+ mul.wide.s32 %rd140, %r404, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p91, %r579, 7;
- mov.u32 %r579, %r87;
+ setp.gt.u32 %p91, %r573, 7;
+ mov.u32 %r573, %r86;
@%p91 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r580, %f668;
+ mov.b32 %r574, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
- shl.b32 %r407, %r3, %r51;
- add.s32 %r408, %r50, %r407;
- mul.wide.s32 %rd143, %r408, 4;
+ shl.b32 %r406, %r3, %r50;
+ add.s32 %r407, %r49, %r406;
+ mul.wide.s32 %rd143, %r407, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
- mov.u32 %r581, %r593;
+ mov.u32 %r575, %r587;
$L__BB0_147:
- shr.u32 %r91, %r581, 1;
- setp.ge.u32 %p96, %r9, %r91;
+ shr.u32 %r90, %r575, 1;
+ setp.ge.u32 %p96, %r9, %r90;
@%p96 bra $L__BB0_149;
- mad.lo.s32 %r409, %r91, %r3, %r50;
- mul.wide.s32 %rd146, %r409, 4;
+ mad.lo.s32 %r408, %r90, %r3, %r49;
+ mul.wide.s32 %rd146, %r408, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p97, %r581, 7;
- mov.u32 %r581, %r91;
+ setp.gt.u32 %p97, %r575, 7;
+ mov.u32 %r575, %r90;
@%p97 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r582, 0;
+ mov.u32 %r576, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r582, %f669;
+ mov.b32 %r576, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
- shl.b32 %r411, %r3, %r51;
- add.s32 %r412, %r50, %r411;
- mul.wide.s32 %rd149, %r412, 4;
+ shl.b32 %r410, %r3, %r50;
+ add.s32 %r411, %r49, %r410;
+ mul.wide.s32 %rd149, %r411, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
- mov.u32 %r583, %r593;
+ mov.u32 %r577, %r587;
$L__BB0_158:
- shr.u32 %r95, %r583, 1;
- setp.ge.u32 %p102, %r9, %r95;
+ shr.u32 %r94, %r577, 1;
+ setp.ge.u32 %p102, %r9, %r94;
@%p102 bra $L__BB0_160;
- mad.lo.s32 %r413, %r95, %r3, %r50;
- mul.wide.s32 %rd152, %r413, 4;
+ mad.lo.s32 %r412, %r94, %r3, %r49;
+ mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p103, %r583, 7;
- mov.u32 %r583, %r95;
+ setp.gt.u32 %p103, %r577, 7;
+ mov.u32 %r577, %r94;
@%p103 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r584, 0;
+ mov.u32 %r578, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r584, %f670;
+ mov.b32 %r578, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
- shl.b32 %r415, %r3, %r51;
- add.s32 %r416, %r50, %r415;
- mul.wide.s32 %rd155, %r416, 4;
+ shl.b32 %r414, %r3, %r50;
+ add.s32 %r415, %r49, %r414;
+ mul.wide.s32 %rd155, %r415, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
- mov.u32 %r585, %r593;
+ mov.u32 %r579, %r587;
$L__BB0_169:
- shr.u32 %r99, %r585, 1;
- setp.ge.u32 %p108, %r9, %r99;
+ shr.u32 %r98, %r579, 1;
+ setp.ge.u32 %p108, %r9, %r98;
@%p108 bra $L__BB0_171;
- mad.lo.s32 %r417, %r99, %r3, %r50;
- mul.wide.s32 %rd158, %r417, 4;
+ mad.lo.s32 %r416, %r98, %r3, %r49;
+ mul.wide.s32 %rd158, %r416, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p109, %r585, 7;
- mov.u32 %r585, %r99;
+ setp.gt.u32 %p109, %r579, 7;
+ mov.u32 %r579, %r98;
@%p109 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r586, 0;
+ mov.u32 %r580, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r586, %f671;
+ mov.b32 %r580, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
- shl.b32 %r419, %r3, %r51;
- add.s32 %r420, %r50, %r419;
- mul.wide.s32 %rd161, %r420, 4;
+ shl.b32 %r418, %r3, %r50;
+ add.s32 %r419, %r49, %r418;
+ mul.wide.s32 %rd161, %r419, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
- mov.u32 %r587, %r593;
+ mov.u32 %r581, %r587;
$L__BB0_180:
- shr.u32 %r103, %r587, 1;
- setp.ge.u32 %p114, %r9, %r103;
+ shr.u32 %r102, %r581, 1;
+ setp.ge.u32 %p114, %r9, %r102;
@%p114 bra $L__BB0_182;
- mad.lo.s32 %r421, %r103, %r3, %r50;
- mul.wide.s32 %rd164, %r421, 4;
+ mad.lo.s32 %r420, %r102, %r3, %r49;
+ mul.wide.s32 %rd164, %r420, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p115, %r587, 7;
- mov.u32 %r587, %r103;
+ setp.gt.u32 %p115, %r581, 7;
+ mov.u32 %r581, %r102;
@%p115 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r588, 0;
+ mov.u32 %r582, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r588, %f672;
+ mov.b32 %r582, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
- shl.b32 %r423, %r3, %r51;
- add.s32 %r424, %r50, %r423;
- mul.wide.s32 %rd167, %r424, 4;
+ shl.b32 %r422, %r3, %r50;
+ add.s32 %r423, %r49, %r422;
+ mul.wide.s32 %rd167, %r423, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
- mov.u32 %r589, %r593;
+ mov.u32 %r583, %r587;
$L__BB0_191:
- shr.u32 %r107, %r589, 1;
- setp.ge.u32 %p120, %r9, %r107;
+ shr.u32 %r106, %r583, 1;
+ setp.ge.u32 %p120, %r9, %r106;
@%p120 bra $L__BB0_193;
- mad.lo.s32 %r425, %r107, %r3, %r50;
- mul.wide.s32 %rd170, %r425, 4;
+ mad.lo.s32 %r424, %r106, %r3, %r49;
+ mul.wide.s32 %rd170, %r424, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p121, %r589, 7;
- mov.u32 %r589, %r107;
+ setp.gt.u32 %p121, %r583, 7;
+ mov.u32 %r583, %r106;
@%p121 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r590, 0;
+ mov.u32 %r584, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r590, %f673;
+ mov.b32 %r584, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
- shl.b32 %r427, %r3, %r51;
- add.s32 %r428, %r50, %r427;
- mul.wide.s32 %rd173, %r428, 4;
+ shl.b32 %r426, %r3, %r50;
+ add.s32 %r427, %r49, %r426;
+ mul.wide.s32 %rd173, %r427, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
- mov.u32 %r591, %r593;
+ mov.u32 %r585, %r587;
$L__BB0_202:
- shr.u32 %r111, %r591, 1;
- setp.ge.u32 %p126, %r9, %r111;
+ shr.u32 %r110, %r585, 1;
+ setp.ge.u32 %p126, %r9, %r110;
@%p126 bra $L__BB0_204;
- mad.lo.s32 %r429, %r111, %r3, %r50;
- mul.wide.s32 %rd176, %r429, 4;
+ mad.lo.s32 %r428, %r110, %r3, %r49;
+ mul.wide.s32 %rd176, %r428, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p127, %r591, 7;
- mov.u32 %r591, %r111;
+ setp.gt.u32 %p127, %r585, 7;
+ mov.u32 %r585, %r110;
@%p127 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r592, 0;
+ mov.u32 %r586, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r592, %f674;
+ mov.b32 %r586, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
- shl.b32 %r431, %r3, %r51;
- add.s32 %r432, %r50, %r431;
- mul.wide.s32 %rd179, %r432, 4;
+ shl.b32 %r430, %r3, %r50;
+ add.s32 %r431, %r49, %r430;
+ mul.wide.s32 %rd179, %r431, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r593, 1;
- setp.ge.u32 %p132, %r9, %r115;
+ shr.u32 %r114, %r587, 1;
+ setp.ge.u32 %p132, %r9, %r114;
@%p132 bra $L__BB0_214;
- mad.lo.s32 %r433, %r115, %r3, %r50;
- mul.wide.s32 %rd182, %r433, 4;
+ mad.lo.s32 %r432, %r114, %r3, %r49;
+ mul.wide.s32 %rd182, %r432, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p133, %r593, 7;
- mov.u32 %r593, %r115;
+ setp.gt.u32 %p133, %r587, 7;
+ mov.u32 %r587, %r114;
@%p133 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r594, 0;
+ mov.u32 %r588, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1735,275 +1733,271 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r594, %f675;
+ mov.b32 %r588, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
- shl.b32 %r552, %r5, 3;
- mov.u32 %r459, %ctaid.y;
- mad.lo.s32 %r460, %r213, %r459, %r552;
- add.s32 %r461, %r460, %r85;
- mul.wide.s32 %rd191, %r461, 4;
+ mov.u32 %r458, %ctaid.y;
+ mad.lo.s32 %r459, %r212, %r458, %r8;
+ add.s32 %r460, %r459, %r84;
+ mul.wide.s32 %rd191, %r460, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r564,%r566,%r568,%r570};
-
- add.s32 %r462, %r461, 4;
- mul.wide.s32 %rd192, %r462, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r558,%r560,%r562,%r564};
+
+ add.s32 %r461, %r460, 4;
+ mul.wide.s32 %rd192, %r461, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r572,%r574,%r576,%r578};
+ st.volatile.global.v4.s32 [%rd190], {%r566,%r568,%r570,%r572};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r549, %r5, 3;
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
- add.s32 %r435, %r549, 3;
- sub.s32 %r118, %r435, %r213;
- mov.u32 %r436, %ctaid.y;
- mad.lo.s32 %r119, %r213, %r436, %r549;
- neg.s32 %r437, %r85;
- setp.ge.s32 %p139, %r118, %r437;
+ add.s32 %r434, %r8, 3;
+ sub.s32 %r117, %r434, %r212;
+ mov.u32 %r435, %ctaid.y;
+ mad.lo.s32 %r118, %r212, %r435, %r8;
+ neg.s32 %r436, %r84;
+ setp.ge.s32 %p139, %r117, %r436;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
- add.s32 %r442, %r119, %r85;
- mul.wide.s32 %rd186, %r442, 4;
+ add.s32 %r441, %r118, %r84;
+ mul.wide.s32 %rd186, %r441, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r564,%r566,%r568,%r570};
+ st.volatile.global.v4.s32 [%rd185], {%r558,%r560,%r562,%r564};
$L__BB0_222:
- mov.u32 %r443, -4;
- sub.s32 %r444, %r443, %r85;
- setp.ge.s32 %p141, %r118, %r444;
+ mov.u32 %r442, -4;
+ sub.s32 %r443, %r442, %r84;
+ setp.ge.s32 %p141, %r117, %r443;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
- add.s32 %r449, %r119, %r85;
- add.s32 %r450, %r449, 4;
- mul.wide.s32 %rd188, %r450, 4;
+ add.s32 %r448, %r118, %r84;
+ add.s32 %r449, %r448, 4;
+ mul.wide.s32 %rd188, %r449, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r572,%r574,%r576,%r578};
+ st.volatile.global.v4.s32 [%rd187], {%r566,%r568,%r570,%r572};
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
- shl.b32 %r551, %r5, 3;
- shl.b32 %r487, %r562, 5;
- mov.u32 %r488, %ctaid.y;
- mad.lo.s32 %r489, %r213, %r488, %r551;
- add.s32 %r490, %r489, %r487;
- mul.wide.s32 %rd199, %r490, 4;
+ shl.b32 %r486, %r556, 5;
+ mov.u32 %r487, %ctaid.y;
+ mad.lo.s32 %r488, %r212, %r487, %r8;
+ add.s32 %r489, %r488, %r486;
+ mul.wide.s32 %rd199, %r489, 4;
add.s64 %rd197, %rd42, %rd199;
- st.volatile.global.v4.s32 [%rd197], {%r580,%r582,%r584,%r586};
-
- add.s32 %r491, %r490, 4;
- mul.wide.s32 %rd200, %r491, 4;
+ st.volatile.global.v4.s32 [%rd197], {%r574,%r576,%r578,%r580};
+
+ add.s32 %r490, %r489, 4;
+ mul.wide.s32 %rd200, %r490, 4;
add.s64 %rd198, %rd42, %rd200;
- st.volatile.global.v4.s32 [%rd198], {%r588,%r590,%r592,%r594};
+ st.volatile.global.v4.s32 [%rd198], {%r582,%r584,%r586,%r588};
bra.uni $L__BB0_235;
$L__BB0_227:
- shl.b32 %r550, %r5, 3;
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
- add.s32 %r463, %r550, 3;
- sub.s32 %r120, %r463, %r213;
- mov.u32 %r464, %ctaid.y;
- mad.lo.s32 %r121, %r213, %r464, %r550;
+ add.s32 %r462, %r8, 3;
+ sub.s32 %r119, %r462, %r212;
+ mov.u32 %r463, %ctaid.y;
+ mad.lo.s32 %r120, %r212, %r463, %r8;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
- shl.b32 %r122, %r562, 5;
- neg.s32 %r465, %r122;
- setp.ge.s32 %p148, %r120, %r465;
+ shl.b32 %r121, %r556, 5;
+ neg.s32 %r464, %r121;
+ setp.ge.s32 %p148, %r119, %r464;
@%p148 bra $L__BB0_230;
- add.s32 %r470, %r121, %r122;
- mul.wide.s32 %rd194, %r470, 4;
+ add.s32 %r469, %r120, %r121;
+ mul.wide.s32 %rd194, %r469, 4;
add.s64 %rd193, %rd42, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r580,%r582,%r584,%r586};
+ st.volatile.global.v4.s32 [%rd193], {%r574,%r576,%r578,%r580};
$L__BB0_230:
@%p147 bra $L__BB0_235;
- shl.b32 %r123, %r562, 5;
- mov.u32 %r471, -4;
- sub.s32 %r472, %r471, %r123;
- setp.ge.s32 %p150, %r120, %r472;
+ shl.b32 %r122, %r556, 5;
+ mov.u32 %r470, -4;
+ sub.s32 %r471, %r470, %r122;
+ setp.ge.s32 %p150, %r119, %r471;
@%p150 bra $L__BB0_235;
- add.s32 %r477, %r121, %r123;
- add.s32 %r478, %r477, 4;
- mul.wide.s32 %rd196, %r478, 4;
+ add.s32 %r476, %r120, %r122;
+ add.s32 %r477, %r476, 4;
+ mul.wide.s32 %rd196, %r477, 4;
add.s64 %rd195, %rd42, %rd196;
- st.volatile.global.v4.s32 [%rd195], {%r588,%r590,%r592,%r594};
+ st.volatile.global.v4.s32 [%rd195], {%r582,%r584,%r586,%r588};
$L__BB0_235:
- mov.u32 %r124, %ctaid.y;
+ mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r492, %r5, %r9;
- or.b32 %r494, %r492, %r364;
- setp.ne.s32 %p152, %r494, 0;
+ or.b32 %r491, %r5, %r9;
+ or.b32 %r493, %r491, %r363;
+ setp.ne.s32 %p152, %r493, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
- mov.u32 %r495, %ctaid.x;
- mov.u32 %r496, %ctaid.z;
- mov.u32 %r497, %nctaid.x;
- mad.lo.s32 %r498, %r496, %r497, %r495;
- mul.wide.s32 %rd202, %r498, 8;
+ mov.u32 %r494, %ctaid.x;
+ mov.u32 %r495, %ctaid.z;
+ mov.u32 %r496, %nctaid.x;
+ mad.lo.s32 %r497, %r495, %r496, %r494;
+ mul.wide.s32 %rd202, %r497, 8;
add.s64 %rd27, %rd201, %rd202;
- add.s32 %r499, %r11, -1;
- setp.eq.s32 %p153, %r124, %r499;
+ add.s32 %r498, %r11, -1;
+ setp.eq.s32 %p153, %r123, %r498;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
- mov.u32 %r595, 8;
+ mov.u32 %r589, 8;
$L__BB0_238:
- nanosleep.u32 %r595;
-
- setp.lt.u32 %p155, %r595, 256;
- selp.u32 %r502, 1, 0, %p155;
- shl.b32 %r595, %r595, %r502;
+ nanosleep.u32 %r589;
+
+ setp.lt.u32 %p155, %r589, 256;
+ selp.u32 %r501, 1, 0, %p155;
+ shl.b32 %r589, %r589, %r501;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- mov.u32 %r504, 1;
- add.s32 %r505, %r213, 1;
- shr.u32 %r506, %r505, 31;
- add.s32 %r507, %r505, %r506;
- shr.s32 %r508, %r507, 1;
- add.s32 %r509, %r4, %r508;
- add.s32 %r510, %r509, -1;
- div.s32 %r511, %r510, %r4;
- add.s32 %r512, %r11, -1;
- add.s32 %r513, %r512, %r511;
- div.s32 %r127, %r513, %r11;
- add.s32 %r128, %r512, %r3;
- shl.b32 %r129, %r9, 1;
- shl.b32 %r514, %r4, 1;
- mad.lo.s32 %r132, %r514, %r124, %r129;
- or.b32 %r130, %r132, 1;
- mul.lo.s32 %r131, %r514, %r11;
- clz.b32 %r515, %r3;
- mov.u32 %r516, 31;
- sub.s32 %r517, %r516, %r515;
- shl.b32 %r133, %r504, %r517;
- setp.lt.u32 %p157, %r5, %r133;
- add.s32 %r518, %r133, %r5;
- setp.lt.u32 %p158, %r518, %r3;
+ mov.u32 %r503, 1;
+ add.s32 %r504, %r212, 1;
+ shr.u32 %r505, %r504, 31;
+ add.s32 %r506, %r504, %r505;
+ shr.s32 %r507, %r506, 1;
+ add.s32 %r508, %r4, %r507;
+ add.s32 %r509, %r508, -1;
+ div.s32 %r510, %r509, %r4;
+ add.s32 %r511, %r11, -1;
+ add.s32 %r512, %r511, %r510;
+ div.s32 %r126, %r512, %r11;
+ add.s32 %r127, %r511, %r3;
+ shl.b32 %r128, %r9, 1;
+ shl.b32 %r513, %r4, 1;
+ mad.lo.s32 %r131, %r513, %r123, %r128;
+ or.b32 %r129, %r131, 1;
+ mul.lo.s32 %r130, %r513, %r11;
+ clz.b32 %r514, %r3;
+ mov.u32 %r515, 31;
+ sub.s32 %r516, %r515, %r514;
+ shl.b32 %r132, %r503, %r516;
+ setp.lt.u32 %p157, %r5, %r132;
+ add.s32 %r517, %r132, %r5;
+ setp.lt.u32 %p158, %r517, %r3;
and.pred %p7, %p157, %p158;
- add.s32 %r519, %r50, %r133;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r132;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd45, %rd211;
- shr.u32 %r520, %r133, 31;
- add.s32 %r521, %r133, %r520;
- shr.s32 %r134, %r521, 1;
- add.s32 %r522, %r50, 1;
- mul.wide.u32 %rd213, %r522, 4;
+ shr.u32 %r519, %r132, 31;
+ add.s32 %r520, %r132, %r519;
+ shr.s32 %r133, %r520, 1;
+ add.s32 %r521, %r49, 1;
+ mul.wide.u32 %rd213, %r521, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
- mov.u32 %r596, 0;
+ mov.u32 %r590, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
- add.s32 %r596, %r596, 1;
+ add.s32 %r590, %r590, 1;
$L__BB0_240:
.pragma "nounroll";
- setp.lt.s32 %p159, %r596, %r127;
+ setp.lt.s32 %p159, %r590, %r126;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
- div.s32 %r160, %r128, %r3;
- setp.lt.s32 %p180, %r160, 1;
+ div.s32 %r159, %r127, %r3;
+ setp.lt.s32 %p180, %r159, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
- mul.lo.s32 %r537, %r131, %r596;
- add.s32 %r161, %r130, %r537;
- add.s32 %r162, %r132, %r537;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r536, %r130, %r590;
+ add.s32 %r160, %r129, %r536;
+ add.s32 %r161, %r131, %r536;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r605, %r536;
+ mov.u32 %r599, %r535;
$L__BB0_276:
.pragma "nounroll";
- setp.ge.s32 %p181, %r161, %r213;
- mov.u32 %r606, %r536;
- mov.u32 %r607, %r536;
+ setp.ge.s32 %p181, %r160, %r212;
+ mov.u32 %r600, %r535;
+ mov.u32 %r601, %r535;
@%p181 bra $L__BB0_279;
- mad.lo.s32 %r164, %r605, %r3, %r5;
- setp.ge.s32 %p182, %r164, %r11;
- mov.u32 %r606, %r536;
- mov.u32 %r607, %r536;
+ mad.lo.s32 %r163, %r599, %r3, %r5;
+ setp.ge.s32 %p182, %r163, %r11;
+ mov.u32 %r600, %r535;
+ mov.u32 %r601, %r535;
@%p182 bra $L__BB0_279;
- mad.lo.s32 %r544, %r164, %r213, %r162;
- mul.wide.s32 %rd225, %r544, 4;
+ mad.lo.s32 %r543, %r163, %r212, %r161;
+ mul.wide.s32 %rd225, %r543, 4;
add.s64 %rd224, %rd41, %rd225;
- ld.volatile.global.v2.s32 {%r607,%r606}, [%rd224];
+ ld.volatile.global.v2.s32 {%r601,%r600}, [%rd224];
$L__BB0_279:
- mov.b32 %f584, %r607;
+ mov.b32 %f584, %r601;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r606;
+ mov.b32 %f585, %r600;
add.f32 %f684, %f684, %f585;
- add.s32 %r605, %r605, 1;
- setp.lt.s32 %p183, %r605, %r160;
+ add.s32 %r599, %r599, 1;
+ setp.lt.s32 %p183, %r599, %r159;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2013,33 +2007,33 @@
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
- setp.lt.s32 %p185, %r133, 4;
+ setp.lt.s32 %p185, %r132, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
- mov.u32 %r608, %r134;
+ mov.u32 %r602, %r133;
$L__BB0_284:
- setp.ge.u32 %p186, %r5, %r608;
+ setp.ge.u32 %p186, %r5, %r602;
@%p186 bra $L__BB0_286;
- add.s32 %r545, %r608, %r50;
- mul.wide.s32 %rd226, %r545, 4;
+ add.s32 %r544, %r602, %r49;
+ mul.wide.s32 %rd226, %r544, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
- shr.u32 %r171, %r608, 1;
- setp.gt.u32 %p187, %r608, 3;
- mov.u32 %r608, %r171;
+ shr.u32 %r170, %r602, 1;
+ setp.gt.u32 %p187, %r602, 3;
+ mov.u32 %r602, %r170;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
@@ -2069,29 +2063,29 @@
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
- mov.u32 %r609, %r134;
+ mov.u32 %r603, %r133;
$L__BB0_294:
- setp.ge.u32 %p192, %r5, %r609;
+ setp.ge.u32 %p192, %r5, %r603;
@%p192 bra $L__BB0_296;
- add.s32 %r546, %r609, %r50;
- mul.wide.s32 %rd229, %r546, 4;
+ add.s32 %r545, %r603, %r49;
+ mul.wide.s32 %rd229, %r545, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
- shr.u32 %r173, %r609, 1;
- setp.gt.u32 %p193, %r609, 3;
- mov.u32 %r609, %r173;
+ shr.u32 %r172, %r603, 1;
+ setp.gt.u32 %p193, %r603, 3;
+ mov.u32 %r603, %r172;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
@@ -2110,74 +2104,74 @@
{ cvt.rn.f16.f32 %rs132, %f687;}
@%p8 bra $L__BB0_303;
- mul.lo.s32 %r174, %r131, %r596;
- add.s32 %r547, %r130, %r174;
- setp.ge.s32 %p197, %r547, %r213;
+ mul.lo.s32 %r173, %r130, %r590;
+ add.s32 %r546, %r129, %r173;
+ setp.ge.s32 %p197, %r546, %r212;
@%p197 bra $L__BB0_303;
- add.s32 %r548, %r132, %r174;
- mul.wide.s32 %rd232, %r548, 2;
+ add.s32 %r547, %r131, %r173;
+ mul.wide.s32 %rd232, %r547, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
- setp.lt.s32 %p160, %r127, 1;
+ setp.lt.s32 %p160, %r126, 1;
@%p160 bra $L__BB0_273;
- div.s32 %r136, %r128, %r3;
- mad.lo.s32 %r137, %r213, %r5, %r129;
- shl.b32 %r138, %r124, 1;
- shl.b32 %r139, %r11, 1;
- mul.lo.s32 %r140, %r213, %r3;
- mov.u32 %r597, 0;
+ div.s32 %r135, %r127, %r3;
+ mad.lo.s32 %r136, %r212, %r5, %r128;
+ shl.b32 %r137, %r123, 1;
+ shl.b32 %r138, %r11, 1;
+ mul.lo.s32 %r139, %r212, %r3;
+ mov.u32 %r591, 0;
$L__BB0_243:
.pragma "nounroll";
- setp.lt.s32 %p161, %r136, 1;
+ setp.lt.s32 %p161, %r135, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
- mad.lo.s32 %r142, %r131, %r597, %r130;
- mad.lo.s32 %r525, %r139, %r597, %r138;
- mad.lo.s32 %r599, %r4, %r525, %r137;
- mov.u32 %r524, 0;
+ mad.lo.s32 %r141, %r130, %r591, %r129;
+ mad.lo.s32 %r524, %r138, %r591, %r137;
+ mad.lo.s32 %r593, %r4, %r524, %r136;
+ mov.u32 %r523, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r598, %r5;
- mov.u32 %r600, %r524;
+ mov.u32 %r592, %r5;
+ mov.u32 %r594, %r523;
$L__BB0_245:
.pragma "nounroll";
- setp.ge.s32 %p162, %r142, %r213;
- mov.u32 %r601, %r524;
- mov.u32 %r602, %r524;
+ setp.ge.s32 %p162, %r141, %r212;
+ mov.u32 %r595, %r523;
+ mov.u32 %r596, %r523;
@%p162 bra $L__BB0_248;
- setp.ge.s32 %p163, %r598, %r11;
- mov.u32 %r601, %r524;
- mov.u32 %r602, %r524;
+ setp.ge.s32 %p163, %r592, %r11;
+ mov.u32 %r595, %r523;
+ mov.u32 %r596, %r523;
@%p163 bra $L__BB0_248;
- mul.wide.s32 %rd215, %r599, 4;
+ mul.wide.s32 %rd215, %r593, 4;
add.s64 %rd214, %rd42, %rd215;
- ld.volatile.global.v2.s32 {%r602,%r601}, [%rd214];
+ ld.volatile.global.v2.s32 {%r596,%r595}, [%rd214];
$L__BB0_248:
- mov.b32 %f558, %r602;
+ mov.b32 %f558, %r596;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r601;
+ mov.b32 %f559, %r595;
add.f32 %f678, %f678, %f559;
- add.s32 %r599, %r599, %r140;
- add.s32 %r598, %r598, %r3;
- add.s32 %r600, %r600, 1;
- setp.lt.s32 %p164, %r600, %r136;
+ add.s32 %r593, %r593, %r139;
+ add.s32 %r592, %r592, %r3;
+ add.s32 %r594, %r594, 1;
+ setp.lt.s32 %p164, %r594, %r135;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@@ -2187,33 +2181,33 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
- setp.lt.s32 %p166, %r133, 4;
+ setp.lt.s32 %p166, %r132, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
- mov.u32 %r603, %r134;
+ mov.u32 %r597, %r133;
$L__BB0_253:
- setp.ge.u32 %p167, %r5, %r603;
+ setp.ge.u32 %p167, %r5, %r597;
@%p167 bra $L__BB0_255;
- add.s32 %r532, %r603, %r50;
- mul.wide.s32 %rd216, %r532, 4;
+ add.s32 %r531, %r597, %r49;
+ mul.wide.s32 %rd216, %r531, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
- shr.u32 %r155, %r603, 1;
- setp.gt.u32 %p168, %r603, 3;
- mov.u32 %r603, %r155;
+ shr.u32 %r154, %r597, 1;
+ setp.gt.u32 %p168, %r597, 3;
+ mov.u32 %r597, %r154;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
@@ -2243,29 +2237,29 @@
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
- mov.u32 %r604, %r134;
+ mov.u32 %r598, %r133;
$L__BB0_263:
- setp.ge.u32 %p173, %r5, %r604;
+ setp.ge.u32 %p173, %r5, %r598;
@%p173 bra $L__BB0_265;
- add.s32 %r533, %r604, %r50;
- mul.wide.s32 %rd219, %r533, 4;
+ add.s32 %r532, %r598, %r49;
+ mul.wide.s32 %rd219, %r532, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
- shr.u32 %r157, %r604, 1;
- setp.gt.u32 %p174, %r604, 3;
- mov.u32 %r604, %r157;
+ shr.u32 %r156, %r598, 1;
+ setp.gt.u32 %p174, %r598, 3;
+ mov.u32 %r598, %r156;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
@@ -2284,23 +2278,23 @@
{ cvt.rn.f16.f32 %rs130, %f681;}
@%p8 bra $L__BB0_272;
- mul.lo.s32 %r158, %r131, %r597;
- add.s32 %r534, %r130, %r158;
- setp.ge.s32 %p178, %r534, %r213;
+ mul.lo.s32 %r157, %r130, %r591;
+ add.s32 %r533, %r129, %r157;
+ setp.ge.s32 %p178, %r533, %r212;
@%p178 bra $L__BB0_272;
- add.s32 %r535, %r132, %r158;
- mul.wide.s32 %rd222, %r535, 2;
+ add.s32 %r534, %r131, %r157;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
- add.s32 %r597, %r597, 1;
- setp.lt.s32 %p179, %r597, %r127;
+ add.s32 %r591, %r591, 1;
+ setp.lt.s32 %p179, %r591, %r126;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
16: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_768
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 60→ 56
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<779>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r202, %r203}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r216, %r217}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r238, %r203, 7;
shr.s32 %r239, %r238, 31;
shr.u32 %r240, %r239, 29;
add.s32 %r241, %r238, %r240;
shr.s32 %r2, %r241, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r242, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r243, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r244, %r4, %r2;
shl.b32 %r245, %r244, 4;
or.b32 %r246, %r245, 15;
and.b32 %r7, %r246, -16;
add.s32 %r247, %r246, %r7;
and.b32 %r248, %r247, -16;
cvt.s64.s32 %rd1, %r248;
max.s32 %r249, %r2, %r3;
add.s32 %r250, %r249, 31;
shr.s32 %r251, %r250, 31;
shr.u32 %r252, %r251, 27;
add.s32 %r253, %r250, %r252;
shr.u32 %r254, %r253, 5;
mul.lo.s32 %r255, %r4, %r254;
shl.b32 %r256, %r255, 7;
cvt.u64.u32 %rd2, %r256;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r257, %r8, 7;
setp.lt.s32 %p7, %r257, %r203;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
// end inline asm
shl.b32 %r261, %r5, 4;
add.s32 %r259, %r258, %r261;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r260, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r260, 0;
cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r735, %r6, 4;
add.s32 %r262, %r4, 215;
div.s32 %r263, %r262, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r264, %r11, %r263;
add.s32 %r265, %r264, -1;
div.s32 %r12, %r265, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r203;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r267, %ctaid.y;
mul.lo.s32 %r268, %r12, %r4;
mul.lo.s32 %r13, %r268, %r267;
shl.b32 %r269, %r9, 1;
shl.b32 %r270, %r5, 4;
mad.lo.s32 %r14, %r269, %r203, %r270;
mul.lo.s32 %r271, %r203, %r9;
cvt.s64.s32 %rd53, %r271;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r272, %r13, %r203;
cvt.s64.s32 %rd6, %r272;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
add.s32 %r15, %r271, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r275, %r274, %r16;
shr.u32 %r17, %r5, 5;
add.s32 %r276, %r275, %r17;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r18, %r5, 31;
add.s32 %r277, %r275, %r18;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r734, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r14, %r280;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r14, %r283;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r734, %r4;
add.s32 %r278, %r23, %r9;
add.s32 %r24, %r278, %r13;
setp.gt.s32 %p13, %r24, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r24, %r212;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r24, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r23, %r203;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r24, %r216;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ cvt.f32.f16 %f234, %rs36;}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ cvt.f32.f16 %f235, %rs37;}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ cvt.f32.f16 %f236, %rs38;}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f237, %rs39;}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ cvt.f32.f16 %f238, %rs40;}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ cvt.f32.f16 %f239, %rs41;}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ cvt.f32.f16 %f240, %rs42;}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ cvt.f32.f16 %f241, %rs43;}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ cvt.f32.f16 %f242, %rs44;}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ cvt.f32.f16 %f243, %rs45;}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ cvt.f32.f16 %f244, %rs46;}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ cvt.f32.f16 %f245, %rs47;}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ cvt.f32.f16 %f246, %rs48;}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ cvt.f32.f16 %f247, %rs49;}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ cvt.f32.f16 %f248, %rs50;}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ cvt.f32.f16 %f249, %rs51;}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ cvt.f32.f16 %f250, %rs52;}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ cvt.f32.f16 %f251, %rs53;}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ cvt.f32.f16 %f252, %rs54;}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ cvt.f32.f16 %f253, %rs55;}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ cvt.f32.f16 %f254, %rs56;}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ cvt.f32.f16 %f255, %rs57;}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ cvt.f32.f16 %f256, %rs58;}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ cvt.f32.f16 %f257, %rs59;}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r735, %r735, 2;
bar.sync 0;
setp.ne.s32 %p23, %r18, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r17, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r18, %r16;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r18, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r18, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r17, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r18, %r16;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r18, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ cvt.f32.f16 %f374, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ cvt.f32.f16 %f375, %rs98;}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ cvt.f32.f16 %f376, %rs99;}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ cvt.f32.f16 %f378, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f379, %rs102;}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ cvt.f32.f16 %f380, %rs103;}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ cvt.f32.f16 %f382, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ cvt.f32.f16 %f383, %rs106;}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ cvt.f32.f16 %f384, %rs107;}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ cvt.f32.f16 %f386, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f387, %rs110;}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ cvt.f32.f16 %f388, %rs111;}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ cvt.f32.f16 %f390, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ cvt.f32.f16 %f391, %rs114;}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ cvt.f32.f16 %f392, %rs115;}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ cvt.f32.f16 %f394, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f395, %rs118;}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ cvt.f32.f16 %f396, %rs119;}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ cvt.f32.f16 %f398, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ cvt.f32.f16 %f399, %rs122;}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ cvt.f32.f16 %f400, %rs123;}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ cvt.f32.f16 %f402, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f403, %rs126;}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ cvt.f32.f16 %f404, %rs127;}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
add.s32 %r416, %r13, %r732;
mad.lo.s32 %r417, %r416, %r203, %r15;
mul.wide.s32 %rd76, %r417, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r734, %r734, 1;
setp.lt.s32 %p49, %r734, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r418, %tid.z;
mad.lo.s32 %r46, %r418, %r4, %r9;
mad.lo.s32 %r47, %r46, %r3, %r5;
mul.wide.u32 %rd77, %r47, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r419, %r4;
mov.u32 %r420, 31;
sub.s32 %r48, %r420, %r419;
mov.u32 %r421, 1;
shl.b32 %r766, %r421, %r48;
setp.lt.u32 %p50, %r9, %r766;
add.s32 %r422, %r766, %r9;
setp.lt.u32 %p51, %r422, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r423, %r3, %r48;
add.s32 %r424, %r47, %r423;
mul.wide.s32 %rd79, %r424, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r766, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r736, %r766;
$L__BB0_40:
shr.u32 %r51, %r736, 1;
setp.ge.u32 %p54, %r9, %r51;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r425, %r51, %r3, %r47;
mul.wide.s32 %rd82, %r425, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r736, 7;
mov.u32 %r736, %r51;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r737, 0;
add.s32 %r427, %r47, %r3;
mul.wide.u32 %rd85, %r427, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r737, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r428, %r3, %r48;
add.s32 %r429, %r47, %r428;
mul.wide.s32 %rd87, %r429, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r738, %r766;
$L__BB0_51:
shr.u32 %r55, %r738, 1;
setp.ge.u32 %p60, %r9, %r55;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r430, %r55, %r3, %r47;
mul.wide.s32 %rd90, %r430, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r738, 7;
mov.u32 %r738, %r55;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r739, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r432, %r3, %r48;
add.s32 %r433, %r47, %r432;
mul.wide.s32 %rd93, %r433, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r740, %r766;
$L__BB0_62:
shr.u32 %r59, %r740, 1;
setp.ge.u32 %p66, %r9, %r59;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r434, %r59, %r3, %r47;
mul.wide.s32 %rd96, %r434, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r740, 7;
mov.u32 %r740, %r59;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r741, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r436, %r3, %r48;
add.s32 %r437, %r47, %r436;
mul.wide.s32 %rd99, %r437, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r742, %r766;
$L__BB0_73:
shr.u32 %r63, %r742, 1;
setp.ge.u32 %p72, %r9, %r63;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r438, %r63, %r3, %r47;
mul.wide.s32 %rd102, %r438, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r742, 7;
mov.u32 %r742, %r63;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r743, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r440, %r3, %r48;
add.s32 %r441, %r47, %r440;
mul.wide.s32 %rd105, %r441, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r744, %r766;
$L__BB0_84:
shr.u32 %r67, %r744, 1;
setp.ge.u32 %p78, %r9, %r67;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r442, %r67, %r3, %r47;
mul.wide.s32 %rd108, %r442, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r744, 7;
mov.u32 %r744, %r67;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r745, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r444, %r3, %r48;
add.s32 %r445, %r47, %r444;
mul.wide.s32 %rd111, %r445, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r746, %r766;
$L__BB0_95:
shr.u32 %r71, %r746, 1;
setp.ge.u32 %p84, %r9, %r71;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r446, %r71, %r3, %r47;
mul.wide.s32 %rd114, %r446, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r746, 7;
mov.u32 %r746, %r71;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r747, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r448, %r3, %r48;
add.s32 %r449, %r47, %r448;
mul.wide.s32 %rd117, %r449, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r748, %r766;
$L__BB0_106:
shr.u32 %r75, %r748, 1;
setp.ge.u32 %p90, %r9, %r75;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r450, %r75, %r3, %r47;
mul.wide.s32 %rd120, %r450, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r748, 7;
mov.u32 %r748, %r75;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r749, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r452, %r3, %r48;
add.s32 %r453, %r47, %r452;
mul.wide.s32 %rd123, %r453, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r750, %r766;
$L__BB0_117:
shr.u32 %r79, %r750, 1;
setp.ge.u32 %p96, %r9, %r79;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r454, %r79, %r3, %r47;
mul.wide.s32 %rd126, %r454, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r750, 7;
mov.u32 %r750, %r79;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r751, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r82, %r735, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r456, %r3, %r48;
add.s32 %r457, %r47, %r456;
mul.wide.s32 %rd129, %r457, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r752, %r766;
$L__BB0_128:
shr.u32 %r84, %r752, 1;
setp.ge.u32 %p102, %r9, %r84;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r458, %r84, %r3, %r47;
mul.wide.s32 %rd132, %r458, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r752, 7;
mov.u32 %r752, %r84;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r753, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r460, %r3, %r48;
add.s32 %r461, %r47, %r460;
mul.wide.s32 %rd135, %r461, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r754, %r766;
$L__BB0_139:
shr.u32 %r88, %r754, 1;
setp.ge.u32 %p108, %r9, %r88;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r462, %r88, %r3, %r47;
mul.wide.s32 %rd138, %r462, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r754, 7;
mov.u32 %r754, %r88;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r755, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r464, %r3, %r48;
add.s32 %r465, %r47, %r464;
mul.wide.s32 %rd141, %r465, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r756, %r766;
$L__BB0_150:
shr.u32 %r92, %r756, 1;
setp.ge.u32 %p114, %r9, %r92;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r466, %r92, %r3, %r47;
mul.wide.s32 %rd144, %r466, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r756, 7;
mov.u32 %r756, %r92;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r757, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r468, %r3, %r48;
add.s32 %r469, %r47, %r468;
mul.wide.s32 %rd147, %r469, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r758, %r766;
$L__BB0_161:
shr.u32 %r96, %r758, 1;
setp.ge.u32 %p120, %r9, %r96;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r470, %r96, %r3, %r47;
mul.wide.s32 %rd150, %r470, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r758, 7;
mov.u32 %r758, %r96;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r759, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r472, %r3, %r48;
add.s32 %r473, %r47, %r472;
mul.wide.s32 %rd153, %r473, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r760, %r766;
$L__BB0_172:
shr.u32 %r100, %r760, 1;
setp.ge.u32 %p126, %r9, %r100;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r474, %r100, %r3, %r47;
mul.wide.s32 %rd156, %r474, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r760, 7;
mov.u32 %r760, %r100;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r761, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r476, %r3, %r48;
add.s32 %r477, %r47, %r476;
mul.wide.s32 %rd159, %r477, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r762, %r766;
$L__BB0_183:
shr.u32 %r104, %r762, 1;
setp.ge.u32 %p132, %r9, %r104;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r478, %r104, %r3, %r47;
mul.wide.s32 %rd162, %r478, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r762, 7;
mov.u32 %r762, %r104;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r763, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r763, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r480, %r3, %r48;
add.s32 %r481, %r47, %r480;
mul.wide.s32 %rd165, %r481, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r764, %r766;
$L__BB0_194:
shr.u32 %r108, %r764, 1;
setp.ge.u32 %p138, %r9, %r108;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r482, %r108, %r3, %r47;
mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r764, 7;
mov.u32 %r764, %r108;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r765, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r765, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r484, %r3, %r48;
add.s32 %r485, %r47, %r484;
mul.wide.s32 %rd171, %r485, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r112, %r766, 1;
setp.ge.u32 %p144, %r9, %r112;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r486, %r112, %r3, %r47;
mul.wide.s32 %rd174, %r486, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r766, 7;
mov.u32 %r766, %r112;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r767, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r767, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
shl.b32 %r731, %r5, 3;
mov.u32 %r512, %ctaid.y;
mad.lo.s32 %r513, %r203, %r512, %r731;
add.s32 %r514, %r513, %r82;
mul.wide.s32 %rd183, %r514, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
// end inline asm
add.s32 %r515, %r514, 4;
mul.wide.s32 %rd184, %r515, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r488, %r728, 3;
sub.s32 %r115, %r488, %r203;
mov.u32 %r489, %ctaid.y;
mad.lo.s32 %r116, %r203, %r489, %r728;
neg.s32 %r490, %r82;
setp.ge.s32 %p151, %r115, %r490;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r495, %r116, %r82;
mul.wide.s32 %rd178, %r495, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
// end inline asm
$L__BB0_214:
mov.u32 %r496, -4;
sub.s32 %r497, %r496, %r82;
setp.ge.s32 %p153, %r115, %r497;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r502, %r116, %r82;
add.s32 %r503, %r502, 4;
mul.wide.s32 %rd180, %r503, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r730, %r5, 3;
shl.b32 %r540, %r735, 5;
mov.u32 %r541, %ctaid.y;
mad.lo.s32 %r542, %r203, %r541, %r730;
add.s32 %r543, %r542, %r540;
mul.wide.s32 %rd191, %r543, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
// end inline asm
add.s32 %r544, %r543, 4;
mul.wide.s32 %rd192, %r544, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r516, %r729, 3;
sub.s32 %r117, %r516, %r203;
mov.u32 %r517, %ctaid.y;
mad.lo.s32 %r118, %r203, %r517, %r729;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r119, %r735, 5;
neg.s32 %r518, %r119;
setp.ge.s32 %p160, %r117, %r518;
@%p160 bra $L__BB0_222;
add.s32 %r523, %r118, %r119;
mul.wide.s32 %rd186, %r523, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r120, %r735, 5;
mov.u32 %r524, -4;
sub.s32 %r525, %r524, %r120;
setp.ge.s32 %p162, %r117, %r525;
@%p162 bra $L__BB0_227;
add.s32 %r530, %r118, %r120;
add.s32 %r531, %r530, 4;
mul.wide.s32 %rd188, %r531, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
// end inline asm
$L__BB0_227:
mov.u32 %r121, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r545, %r5, %r9;
or.b32 %r547, %r545, %r418;
setp.ne.s32 %p164, %r547, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r548, %ctaid.x;
mov.u32 %r549, %ctaid.z;
mov.u32 %r550, %nctaid.x;
mad.lo.s32 %r551, %r549, %r550, %r548;
mul.wide.s32 %rd194, %r551, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r552, %r11, -1;
setp.eq.s32 %p165, %r121, %r552;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r768, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r768;
// end inline asm
setp.lt.u32 %p167, %r768, 256;
selp.u32 %r555, 1, 0, %p167;
shl.b32 %r768, %r768, %r555;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_dabd7834_1033910nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r557, %r203, 1;
shr.u32 %r558, %r557, 31;
add.s32 %r559, %r557, %r558;
shr.s32 %r560, %r559, 1;
add.s32 %r561, %r4, %r560;
add.s32 %r562, %r561, -1;
div.s32 %r563, %r562, %r4;
add.s32 %r564, %r11, -1;
add.s32 %r565, %r564, %r563;
div.s32 %r124, %r565, %r11;
add.s32 %r125, %r564, %r3;
shl.b32 %r126, %r9, 1;
shl.b32 %r566, %r4, 1;
mad.lo.s32 %r129, %r566, %r121, %r126;
or.b32 %r127, %r129, 1;
mul.lo.s32 %r128, %r566, %r11;
shr.u32 %r130, %r3, 5;
mul.lo.s32 %r567, %r46, %r130;
shr.u32 %r131, %r5, 5;
add.s32 %r568, %r567, %r131;
mul.wide.u32 %rd203, %r568, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r132, %r5, 31;
add.s32 %r569, %r567, %r132;
mul.wide.u32 %rd205, %r569, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r769, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r769, %r769, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r769, %r124;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r154, %r125, %r3;
setp.lt.s32 %p206, %r154, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r650, %r128, %r769;
add.s32 %r155, %r127, %r650;
add.s32 %r156, %r129, %r650;
mov.u32 %r649, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r776, %r649;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r155, %r203;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r158, %r776, %r3, %r5;
setp.ge.s32 %p208, %r158, %r11;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r657, %r158, %r203, %r156;
mul.wide.s32 %rd211, %r657, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r778;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r777;
add.f32 %f769, %f769, %f643;
add.s32 %r776, %r776, 1;
setp.lt.s32 %p209, %r776, %r154;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r658, %f770;
mov.u32 %r659, 31;
mov.u32 %r660, 16;
mov.u32 %r661, -1;
shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
mov.b32 %f644, %r662;
add.f32 %f645, %f770, %f644;
mov.b32 %r663, %f645;
mov.u32 %r664, 8;
shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
mov.b32 %f646, %r665;
add.f32 %f647, %f645, %f646;
mov.b32 %r666, %f647;
mov.u32 %r667, 4;
shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
mov.b32 %f648, %r668;
add.f32 %f649, %f647, %f648;
mov.b32 %r669, %f649;
mov.u32 %r670, 2;
shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
mov.b32 %f650, %r671;
add.f32 %f651, %f649, %f650;
mov.b32 %r672, %f651;
mov.u32 %r673, 1;
shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
mov.b32 %f652, %r674;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r132, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r131, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r132, %r130;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r675, %f771;
mov.u32 %r676, 31;
mov.u32 %r677, 16;
mov.u32 %r678, -1;
shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
mov.b32 %f654, %r679;
add.f32 %f655, %f771, %f654;
mov.b32 %r680, %f655;
mov.u32 %r681, 8;
shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
mov.b32 %f656, %r682;
add.f32 %f657, %f655, %f656;
mov.b32 %r683, %f657;
mov.u32 %r684, 4;
shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
mov.b32 %f658, %r685;
add.f32 %f659, %f657, %f658;
mov.b32 %r686, %f659;
mov.u32 %r687, 2;
shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
mov.b32 %f660, %r688;
add.f32 %f661, %f659, %f660;
mov.b32 %r689, %f661;
mov.u32 %r690, 1;
shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
mov.b32 %f662, %r691;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r132, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r692, %f769;
mov.u32 %r693, 31;
mov.u32 %r694, 16;
mov.u32 %r695, -1;
shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
mov.b32 %f665, %r696;
add.f32 %f666, %f769, %f665;
mov.b32 %r697, %f666;
mov.u32 %r698, 8;
shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
mov.b32 %f667, %r699;
add.f32 %f668, %f666, %f667;
mov.b32 %r700, %f668;
mov.u32 %r701, 4;
shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
mov.b32 %f669, %r702;
add.f32 %f670, %f668, %f669;
mov.b32 %r703, %f670;
mov.u32 %r704, 2;
shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
mov.b32 %f671, %r705;
add.f32 %f672, %f670, %f671;
mov.b32 %r706, %f672;
mov.u32 %r707, 1;
shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
mov.b32 %f673, %r708;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r132, %r130;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r709, %f773;
mov.u32 %r710, 31;
mov.u32 %r711, 16;
mov.u32 %r712, -1;
shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
mov.b32 %f675, %r713;
add.f32 %f676, %f773, %f675;
mov.b32 %r714, %f676;
mov.u32 %r715, 8;
shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
mov.b32 %f677, %r716;
add.f32 %f678, %f676, %f677;
mov.b32 %r717, %f678;
mov.u32 %r718, 4;
shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
mov.b32 %f679, %r719;
add.f32 %f680, %f678, %f679;
mov.b32 %r720, %f680;
mov.u32 %r721, 2;
shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
mov.b32 %f681, %r722;
add.f32 %f682, %f680, %f681;
mov.b32 %r723, %f682;
mov.u32 %r724, 1;
shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
mov.b32 %f683, %r725;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r164, %r128, %r769;
add.s32 %r726, %r127, %r164;
setp.ge.s32 %p239, %r726, %r203;
@%p239 bra $L__BB0_279;
add.s32 %r727, %r129, %r164;
mul.wide.s32 %rd212, %r727, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r124, 1;
@%p170 bra $L__BB0_257;
div.s32 %r134, %r125, %r3;
mad.lo.s32 %r135, %r203, %r5, %r126;
shl.b32 %r136, %r121, 1;
shl.b32 %r137, %r11, 1;
mul.lo.s32 %r138, %r203, %r3;
mov.u32 %r770, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r134, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r140, %r128, %r770, %r127;
mad.lo.s32 %r572, %r137, %r770, %r136;
mad.lo.s32 %r772, %r4, %r572, %r135;
mov.u32 %r571, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r771, %r5;
mov.u32 %r773, %r571;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r140, %r203;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r771, %r11;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r772, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r775;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r774;
add.f32 %f761, %f761, %f595;
add.s32 %r772, %r772, %r138;
add.s32 %r771, %r771, %r3;
add.s32 %r773, %r773, 1;
setp.lt.s32 %p174, %r773, %r134;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r579, %f762;
mov.u32 %r580, 31;
mov.u32 %r581, 16;
mov.u32 %r582, -1;
shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
mov.b32 %f596, %r583;
add.f32 %f597, %f762, %f596;
mov.b32 %r584, %f597;
mov.u32 %r585, 8;
shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
mov.b32 %f598, %r586;
add.f32 %f599, %f597, %f598;
mov.b32 %r587, %f599;
mov.u32 %r588, 4;
shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
mov.b32 %f600, %r589;
add.f32 %f601, %f599, %f600;
mov.b32 %r590, %f601;
mov.u32 %r591, 2;
shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
mov.b32 %f602, %r592;
add.f32 %f603, %f601, %f602;
mov.b32 %r593, %f603;
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
mov.b32 %f604, %r595;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r132, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r131, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r132, %r130;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r596, %f763;
mov.u32 %r597, 31;
mov.u32 %r598, 16;
mov.u32 %r599, -1;
shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
mov.b32 %f606, %r600;
add.f32 %f607, %f763, %f606;
mov.b32 %r601, %f607;
mov.u32 %r602, 8;
shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
mov.b32 %f608, %r603;
add.f32 %f609, %f607, %f608;
mov.b32 %r604, %f609;
mov.u32 %r605, 4;
shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
mov.b32 %f610, %r606;
add.f32 %f611, %f609, %f610;
mov.b32 %r607, %f611;
mov.u32 %r608, 2;
shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
mov.b32 %f612, %r609;
add.f32 %f613, %f611, %f612;
mov.b32 %r610, %f613;
mov.u32 %r611, 1;
shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
mov.b32 %f614, %r612;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r132, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r613, %f761;
mov.u32 %r614, 31;
mov.u32 %r615, 16;
mov.u32 %r616, -1;
shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
mov.b32 %f617, %r617;
add.f32 %f618, %f761, %f617;
mov.b32 %r618, %f618;
mov.u32 %r619, 8;
shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
mov.b32 %f619, %r620;
add.f32 %f620, %f618, %f619;
mov.b32 %r621, %f620;
mov.u32 %r622, 4;
shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
mov.b32 %f621, %r623;
add.f32 %f622, %f620, %f621;
mov.b32 %r624, %f622;
mov.u32 %r625, 2;
shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
mov.b32 %f623, %r626;
add.f32 %f624, %f622, %f623;
mov.b32 %r627, %f624;
mov.u32 %r628, 1;
shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
mov.b32 %f625, %r629;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r132, %r130;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r630, %f765;
mov.u32 %r631, 31;
mov.u32 %r632, 16;
mov.u32 %r633, -1;
shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
mov.b32 %f627, %r634;
add.f32 %f628, %f765, %f627;
mov.b32 %r635, %f628;
mov.u32 %r636, 8;
shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
mov.b32 %f629, %r637;
add.f32 %f630, %f628, %f629;
mov.b32 %r638, %f630;
mov.u32 %r639, 4;
shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
mov.b32 %f631, %r640;
add.f32 %f632, %f630, %f631;
mov.b32 %r641, %f632;
mov.u32 %r642, 2;
shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
mov.b32 %f633, %r643;
add.f32 %f634, %f632, %f633;
mov.b32 %r644, %f634;
mov.u32 %r645, 1;
shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
mov.b32 %f635, %r646;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r152, %r128, %r770;
add.s32 %r647, %r127, %r152;
setp.ge.s32 %p204, %r647, %r203;
@%p204 bra $L__BB0_256;
add.s32 %r648, %r129, %r152;
mul.wide.s32 %rd208, %r648, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r770, %r770, 1;
setp.lt.s32 %p205, %r770, %r124;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
max.s32 %r248, %r2, %r3;
add.s32 %r249, %r248, 31;
shr.s32 %r250, %r249, 31;
shr.u32 %r251, %r250, 27;
add.s32 %r252, %r249, %r251;
shr.u32 %r253, %r252, 5;
mul.lo.s32 %r254, %r4, %r253;
shl.b32 %r255, %r254, 7;
cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r256, %r8, 7;
setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
shl.b32 %r260, %r5, 4;
add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r729, %r6, 4;
add.s32 %r261, %r4, 215;
div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r263, %r11, %r262;
add.s32 %r264, %r263, -1;
div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r266, %ctaid.y;
mul.lo.s32 %r267, %r12, %r4;
mul.lo.s32 %r13, %r267, %r266;
mad.lo.s32 %r268, %r2, %r9, %r5;
shl.b32 %r14, %r268, 4;
mul.lo.s32 %r269, %r202, %r9;
cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r270, %r13, %r202;
cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
shl.b32 %r271, %r9, 3;
mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r275, %r274, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r17, %r5, 31;
add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r280, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r728, %r4;
add.s32 %r278, %r22, %r9;
add.s32 %r23, %r278, %r13;
setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ cvt.f32.f16 %f234, %rs36;}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ cvt.f32.f16 %f235, %rs37;}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ cvt.f32.f16 %f236, %rs38;}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f237, %rs39;}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ cvt.f32.f16 %f238, %rs40;}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ cvt.f32.f16 %f239, %rs41;}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ cvt.f32.f16 %f240, %rs42;}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ cvt.f32.f16 %f241, %rs43;}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ cvt.f32.f16 %f242, %rs44;}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ cvt.f32.f16 %f243, %rs45;}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ cvt.f32.f16 %f244, %rs46;}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ cvt.f32.f16 %f245, %rs47;}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ cvt.f32.f16 %f246, %rs48;}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ cvt.f32.f16 %f247, %rs49;}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ cvt.f32.f16 %f248, %rs50;}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ cvt.f32.f16 %f249, %rs51;}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ cvt.f32.f16 %f250, %rs52;}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ cvt.f32.f16 %f251, %rs53;}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ cvt.f32.f16 %f252, %rs54;}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ cvt.f32.f16 %f253, %rs55;}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ cvt.f32.f16 %f254, %rs56;}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ cvt.f32.f16 %f255, %rs57;}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ cvt.f32.f16 %f256, %rs58;}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ cvt.f32.f16 %f257, %rs59;}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r729, %r729, 2;
bar.sync 0;
setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ cvt.f32.f16 %f374, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ cvt.f32.f16 %f375, %rs98;}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ cvt.f32.f16 %f376, %rs99;}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ cvt.f32.f16 %f378, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f379, %rs102;}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ cvt.f32.f16 %f380, %rs103;}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ cvt.f32.f16 %f382, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ cvt.f32.f16 %f383, %rs106;}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ cvt.f32.f16 %f384, %rs107;}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ cvt.f32.f16 %f386, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f387, %rs110;}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ cvt.f32.f16 %f388, %rs111;}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ cvt.f32.f16 %f390, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ cvt.f32.f16 %f391, %rs114;}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ cvt.f32.f16 %f392, %rs115;}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ cvt.f32.f16 %f394, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f395, %rs118;}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ cvt.f32.f16 %f396, %rs119;}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ cvt.f32.f16 %f398, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ cvt.f32.f16 %f399, %rs122;}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ cvt.f32.f16 %f400, %rs123;}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ cvt.f32.f16 %f402, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f403, %rs126;}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ cvt.f32.f16 %f404, %rs127;}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
mad.lo.s32 %r416, %r23, %r202, %r8;
mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r728, %r728, 1;
setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r417, %tid.z;
mad.lo.s32 %r45, %r417, %r4, %r9;
mad.lo.s32 %r46, %r45, %r3, %r5;
mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r418, %r4;
mov.u32 %r419, 31;
sub.s32 %r47, %r419, %r418;
mov.u32 %r420, 1;
shl.b32 %r760, %r420, %r47;
setp.lt.u32 %p50, %r9, %r760;
add.s32 %r421, %r760, %r9;
setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r422, %r3, %r47;
add.s32 %r423, %r46, %r422;
mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r730, %r760;
$L__BB0_40:
shr.u32 %r50, %r730, 1;
setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r424, %r50, %r3, %r46;
mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r730, 7;
mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r731, 0;
add.s32 %r426, %r46, %r3;
mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r427, %r3, %r47;
add.s32 %r428, %r46, %r427;
mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r732, %r760;
$L__BB0_51:
shr.u32 %r54, %r732, 1;
setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r429, %r54, %r3, %r46;
mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r732, 7;
mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r431, %r3, %r47;
add.s32 %r432, %r46, %r431;
mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r734, %r760;
$L__BB0_62:
shr.u32 %r58, %r734, 1;
setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r433, %r58, %r3, %r46;
mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r734, 7;
mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r435, %r3, %r47;
add.s32 %r436, %r46, %r435;
mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r736, %r760;
$L__BB0_73:
shr.u32 %r62, %r736, 1;
setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r437, %r62, %r3, %r46;
mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r736, 7;
mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r439, %r3, %r47;
add.s32 %r440, %r46, %r439;
mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r738, %r760;
$L__BB0_84:
shr.u32 %r66, %r738, 1;
setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r441, %r66, %r3, %r46;
mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r738, 7;
mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r443, %r3, %r47;
add.s32 %r444, %r46, %r443;
mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r740, %r760;
$L__BB0_95:
shr.u32 %r70, %r740, 1;
setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r445, %r70, %r3, %r46;
mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r740, 7;
mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r447, %r3, %r47;
add.s32 %r448, %r46, %r447;
mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r742, %r760;
$L__BB0_106:
shr.u32 %r74, %r742, 1;
setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r449, %r74, %r3, %r46;
mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r742, 7;
mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r451, %r3, %r47;
add.s32 %r452, %r46, %r451;
mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r744, %r760;
$L__BB0_117:
shr.u32 %r78, %r744, 1;
setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r453, %r78, %r3, %r46;
mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r744, 7;
mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r455, %r3, %r47;
add.s32 %r456, %r46, %r455;
mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r746, %r760;
$L__BB0_128:
shr.u32 %r83, %r746, 1;
setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r457, %r83, %r3, %r46;
mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r746, 7;
mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r459, %r3, %r47;
add.s32 %r460, %r46, %r459;
mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r748, %r760;
$L__BB0_139:
shr.u32 %r87, %r748, 1;
setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r461, %r87, %r3, %r46;
mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r748, 7;
mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r463, %r3, %r47;
add.s32 %r464, %r46, %r463;
mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r750, %r760;
$L__BB0_150:
shr.u32 %r91, %r750, 1;
setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r465, %r91, %r3, %r46;
mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r750, 7;
mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r467, %r3, %r47;
add.s32 %r468, %r46, %r467;
mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r752, %r760;
$L__BB0_161:
shr.u32 %r95, %r752, 1;
setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r469, %r95, %r3, %r46;
mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r752, 7;
mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r471, %r3, %r47;
add.s32 %r472, %r46, %r471;
mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r754, %r760;
$L__BB0_172:
shr.u32 %r99, %r754, 1;
setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r473, %r99, %r3, %r46;
mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r754, 7;
mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r475, %r3, %r47;
add.s32 %r476, %r46, %r475;
mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r756, %r760;
$L__BB0_183:
shr.u32 %r103, %r756, 1;
setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r477, %r103, %r3, %r46;
mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r756, 7;
mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r479, %r3, %r47;
add.s32 %r480, %r46, %r479;
mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r758, %r760;
$L__BB0_194:
shr.u32 %r107, %r758, 1;
setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r481, %r107, %r3, %r46;
mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r758, 7;
mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r483, %r3, %r47;
add.s32 %r484, %r46, %r483;
mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r111, %r760, 1;
setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r485, %r111, %r3, %r46;
mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r760, 7;
mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
mov.u32 %r511, %ctaid.y;
mad.lo.s32 %r512, %r202, %r511, %r8;
add.s32 %r513, %r512, %r81;
mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
// end inline asm
add.s32 %r514, %r513, 4;
mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r487, %r8, 3;
sub.s32 %r114, %r487, %r202;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r115, %r202, %r488, %r8;
neg.s32 %r489, %r81;
setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r494, %r115, %r81;
mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
// end inline asm
$L__BB0_214:
mov.u32 %r495, -4;
sub.s32 %r496, %r495, %r81;
setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r501, %r115, %r81;
add.s32 %r502, %r501, 4;
mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r539, %r729, 5;
mov.u32 %r540, %ctaid.y;
mad.lo.s32 %r541, %r202, %r540, %r8;
add.s32 %r542, %r541, %r539;
mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
// end inline asm
add.s32 %r543, %r542, 4;
mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r515, %r8, 3;
sub.s32 %r116, %r515, %r202;
mov.u32 %r516, %ctaid.y;
mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r118, %r729, 5;
neg.s32 %r517, %r118;
setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
add.s32 %r522, %r117, %r118;
mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r119, %r729, 5;
mov.u32 %r523, -4;
sub.s32 %r524, %r523, %r119;
setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
add.s32 %r529, %r117, %r119;
add.s32 %r530, %r529, 4;
mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
// end inline asm
$L__BB0_227:
mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r544, %r5, %r9;
or.b32 %r546, %r544, %r417;
setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r547, %ctaid.x;
mov.u32 %r548, %ctaid.z;
mov.u32 %r549, %nctaid.x;
mad.lo.s32 %r550, %r548, %r549, %r547;
mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r551, %r11, -1;
setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r762, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r762;
// end inline asm
setp.lt.u32 %p167, %r762, 256;
selp.u32 %r554, 1, 0, %p167;
shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_31_cu_c94510e2_723310nvfuser_31ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r556, %r202, 1;
shr.u32 %r557, %r556, 31;
add.s32 %r558, %r556, %r557;
shr.s32 %r559, %r558, 1;
add.s32 %r560, %r4, %r559;
add.s32 %r561, %r560, -1;
div.s32 %r562, %r561, %r4;
add.s32 %r563, %r11, -1;
add.s32 %r564, %r563, %r562;
div.s32 %r123, %r564, %r11;
add.s32 %r124, %r563, %r3;
shl.b32 %r125, %r9, 1;
shl.b32 %r565, %r4, 1;
mad.lo.s32 %r128, %r565, %r120, %r125;
or.b32 %r126, %r128, 1;
mul.lo.s32 %r127, %r565, %r11;
shr.u32 %r129, %r3, 5;
mul.lo.s32 %r566, %r45, %r129;
shr.u32 %r130, %r5, 5;
add.s32 %r567, %r566, %r130;
mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r131, %r5, 31;
add.s32 %r568, %r566, %r131;
mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r153, %r124, %r3;
setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r649, %r127, %r763;
add.s32 %r154, %r126, %r649;
add.s32 %r155, %r128, %r649;
mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r154, %r202;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r157, %r770, %r3, %r5;
setp.ge.s32 %p208, %r157, %r11;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r656, %r157, %r202, %r155;
mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
add.s32 %r770, %r770, 1;
setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r657, %f770;
mov.u32 %r658, 31;
mov.u32 %r659, 16;
mov.u32 %r660, -1;
shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
mov.b32 %r662, %f645;
mov.u32 %r663, 8;
shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
mov.b32 %r665, %f647;
mov.u32 %r666, 4;
shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
mov.b32 %r668, %f649;
mov.u32 %r669, 2;
shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
mov.b32 %r671, %f651;
mov.u32 %r672, 1;
shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r674, %f771;
mov.u32 %r675, 31;
mov.u32 %r676, 16;
mov.u32 %r677, -1;
shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
mov.b32 %r679, %f655;
mov.u32 %r680, 8;
shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
mov.b32 %r682, %f657;
mov.u32 %r683, 4;
shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
mov.b32 %r685, %f659;
mov.u32 %r686, 2;
shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
mov.b32 %r688, %f661;
mov.u32 %r689, 1;
shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r691, %f769;
mov.u32 %r692, 31;
mov.u32 %r693, 16;
mov.u32 %r694, -1;
shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
mov.b32 %r696, %f666;
mov.u32 %r697, 8;
shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
mov.b32 %r699, %f668;
mov.u32 %r700, 4;
shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
mov.b32 %r702, %f670;
mov.u32 %r703, 2;
shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
mov.b32 %r705, %f672;
mov.u32 %r706, 1;
shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r708, %f773;
mov.u32 %r709, 31;
mov.u32 %r710, 16;
mov.u32 %r711, -1;
shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
mov.b32 %r713, %f676;
mov.u32 %r714, 8;
shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
mov.b32 %r716, %f678;
mov.u32 %r717, 4;
shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
mov.b32 %r719, %f680;
mov.u32 %r720, 2;
shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
mov.b32 %r722, %f682;
mov.u32 %r723, 1;
shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r163, %r127, %r763;
add.s32 %r725, %r126, %r163;
setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
add.s32 %r726, %r128, %r163;
mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
div.s32 %r133, %r124, %r3;
mad.lo.s32 %r134, %r202, %r5, %r125;
shl.b32 %r135, %r120, 1;
shl.b32 %r136, %r11, 1;
mul.lo.s32 %r137, %r202, %r3;
mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r139, %r127, %r764, %r126;
mad.lo.s32 %r571, %r136, %r764, %r135;
mad.lo.s32 %r766, %r4, %r571, %r134;
mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r765, %r5;
mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r139, %r202;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r765, %r11;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
add.s32 %r766, %r766, %r137;
add.s32 %r765, %r765, %r3;
add.s32 %r767, %r767, 1;
setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r578, %f762;
mov.u32 %r579, 31;
mov.u32 %r580, 16;
mov.u32 %r581, -1;
shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
mov.b32 %r583, %f597;
mov.u32 %r584, 8;
shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
mov.b32 %r586, %f599;
mov.u32 %r587, 4;
shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
mov.b32 %r589, %f601;
mov.u32 %r590, 2;
shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
mov.b32 %r592, %f603;
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r595, %f763;
mov.u32 %r596, 31;
mov.u32 %r597, 16;
mov.u32 %r598, -1;
shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
mov.b32 %r600, %f607;
mov.u32 %r601, 8;
shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
mov.b32 %r603, %f609;
mov.u32 %r604, 4;
shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
mov.b32 %r606, %f611;
mov.u32 %r607, 2;
shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
mov.b32 %r609, %f613;
mov.u32 %r610, 1;
shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r612, %f761;
mov.u32 %r613, 31;
mov.u32 %r614, 16;
mov.u32 %r615, -1;
shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
mov.b32 %r617, %f618;
mov.u32 %r618, 8;
shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
mov.b32 %r620, %f620;
mov.u32 %r621, 4;
shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
mov.b32 %r623, %f622;
mov.u32 %r624, 2;
shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
mov.b32 %r626, %f624;
mov.u32 %r627, 1;
shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r629, %f765;
mov.u32 %r630, 31;
mov.u32 %r631, 16;
mov.u32 %r632, -1;
shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
mov.b32 %r634, %f628;
mov.u32 %r635, 8;
shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
mov.b32 %r637, %f630;
mov.u32 %r638, 4;
shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
mov.b32 %r640, %f632;
mov.u32 %r641, 2;
shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
mov.b32 %r643, %f634;
mov.u32 %r644, 1;
shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r151, %r127, %r764;
add.s32 %r646, %r126, %r151;
setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
add.s32 %r647, %r128, %r151;
mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r764, %r764, 1;
setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,166 +32,166 @@
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
- .reg .b32 %r<779>;
+ .reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r202, %r203}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r216, %r217}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r238, %r203, 7;
- shr.s32 %r239, %r238, 31;
- shr.u32 %r240, %r239, 29;
- add.s32 %r241, %r238, %r240;
- shr.s32 %r2, %r241, 3;
+ add.s32 %r237, %r202, 7;
+ shr.s32 %r238, %r237, 31;
+ shr.u32 %r239, %r238, 29;
+ add.s32 %r240, %r237, %r239;
+ shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
- mov.u32 %r242, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
+ mov.u32 %r241, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r243, [%rd43], %r5;
+ atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r244, %r4, %r2;
- shl.b32 %r245, %r244, 4;
- or.b32 %r246, %r245, 15;
- and.b32 %r7, %r246, -16;
- add.s32 %r247, %r246, %r7;
- and.b32 %r248, %r247, -16;
- cvt.s64.s32 %rd1, %r248;
- max.s32 %r249, %r2, %r3;
- add.s32 %r250, %r249, 31;
- shr.s32 %r251, %r250, 31;
- shr.u32 %r252, %r251, 27;
- add.s32 %r253, %r250, %r252;
- shr.u32 %r254, %r253, 5;
- mul.lo.s32 %r255, %r4, %r254;
- shl.b32 %r256, %r255, 7;
- cvt.u64.u32 %rd2, %r256;
+ mul.lo.s32 %r243, %r4, %r2;
+ shl.b32 %r244, %r243, 4;
+ or.b32 %r245, %r244, 15;
+ and.b32 %r7, %r245, -16;
+ add.s32 %r246, %r245, %r7;
+ and.b32 %r247, %r246, -16;
+ cvt.s64.s32 %rd1, %r247;
+ max.s32 %r248, %r2, %r3;
+ add.s32 %r249, %r248, 31;
+ shr.s32 %r250, %r249, 31;
+ shr.u32 %r251, %r250, 27;
+ add.s32 %r252, %r249, %r251;
+ shr.u32 %r253, %r252, 5;
+ mul.lo.s32 %r254, %r4, %r253;
+ shl.b32 %r255, %r254, 7;
+ cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r257, %r8, 7;
- setp.lt.s32 %p7, %r257, %r203;
+ or.b32 %r256, %r8, 7;
+ setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
-
-
- shl.b32 %r261, %r5, 4;
- add.s32 %r259, %r258, %r261;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
+
+
+ shl.b32 %r260, %r5, 4;
+ add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
- mov.u32 %r260, 0;
+ mov.u32 %r259, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r260, 0;
- cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r259, 0;
+ cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r735, %r6, 4;
- add.s32 %r262, %r4, 215;
- div.s32 %r263, %r262, %r4;
+ shl.b32 %r729, %r6, 4;
+ add.s32 %r261, %r4, 215;
+ div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r264, %r11, %r263;
- add.s32 %r265, %r264, -1;
- div.s32 %r12, %r265, %r11;
+ add.s32 %r263, %r11, %r262;
+ add.s32 %r264, %r263, -1;
+ div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r203;
+ cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
- mov.u32 %r267, %ctaid.y;
- mul.lo.s32 %r268, %r12, %r4;
- mul.lo.s32 %r13, %r268, %r267;
- shl.b32 %r269, %r9, 1;
- shl.b32 %r270, %r5, 4;
- mad.lo.s32 %r14, %r269, %r203, %r270;
- mul.lo.s32 %r271, %r203, %r9;
- cvt.s64.s32 %rd53, %r271;
+ mov.u32 %r266, %ctaid.y;
+ mul.lo.s32 %r267, %r12, %r4;
+ mul.lo.s32 %r13, %r267, %r266;
+ mad.lo.s32 %r268, %r2, %r9, %r5;
+ shl.b32 %r14, %r268, 4;
+ mul.lo.s32 %r269, %r202, %r9;
+ cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r272, %r13, %r203;
- cvt.s64.s32 %rd6, %r272;
+ mul.lo.s32 %r270, %r13, %r202;
+ cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- add.s32 %r15, %r271, %r8;
+ shl.b32 %r271, %r9, 3;
+ mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 2;
+ mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r275, %r274, %r16;
- shr.u32 %r17, %r5, 5;
- add.s32 %r276, %r275, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r275, %r274, %r15;
+ shr.u32 %r16, %r5, 5;
+ add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
- and.b32 %r18, %r5, 31;
- add.s32 %r277, %r275, %r18;
+ and.b32 %r17, %r5, 31;
+ add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
- mov.u32 %r734, 0;
+ mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
- add.s32 %r281, %r14, %r280;
+ add.s32 %r281, %r280, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
- add.s32 %r284, %r14, %r283;
+ add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
@@ -207,29 +207,29 @@
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r734, %r4;
- add.s32 %r278, %r23, %r9;
- add.s32 %r24, %r278, %r13;
- setp.gt.s32 %p13, %r24, 215;
+ mul.lo.s32 %r22, %r728, %r4;
+ add.s32 %r278, %r22, %r9;
+ add.s32 %r23, %r278, %r13;
+ setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
- mul.lo.s32 %r279, %r24, %r212;
+ mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p14, %r24, 216;
+ setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
- mul.lo.s32 %r286, %r23, %r203;
+ mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
@@ -255,11 +255,11 @@
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
- mul.lo.s32 %r287, %r24, %r216;
+ mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
@@ -478,23 +478,23 @@
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
- shl.b32 %r735, %r735, 2;
- bar.sync 0;
- setp.ne.s32 %p23, %r18, 0;
+ shl.b32 %r729, %r729, 2;
+ bar.sync 0;
+ setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
- setp.ne.s32 %p24, %r17, 0;
+ setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
- setp.ge.u32 %p25, %r18, %r16;
+ setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
@@ -526,11 +526,11 @@
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
- setp.ne.s32 %p242, %r18, 0;
+ setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
@@ -556,23 +556,23 @@
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
- setp.eq.s32 %p37, %r18, 0;
+ setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
- setp.ne.s32 %p240, %r17, 0;
+ setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
- setp.ge.u32 %p39, %r18, %r16;
+ setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
@@ -615,11 +615,11 @@
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
- setp.eq.s32 %p241, %r18, 0;
+ setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
@@ -633,11 +633,10 @@
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
- mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
@@ -846,13 +845,12 @@
{ cvt.rn.f16.f32 %rs124, %f401;}
mov.b32 %r391, {%rs124, %rs128};
- add.s32 %r416, %r13, %r732;
- mad.lo.s32 %r417, %r416, %r203, %r15;
- mul.wide.s32 %rd76, %r417, 2;
+ mad.lo.s32 %r416, %r23, %r202, %r8;
+ mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
bra.uni $L__BB0_35;
@@ -862,12 +860,12 @@
{ cvt.rn.f16.f32 %rs61, %f337;}
$L__BB0_35:
- add.s32 %r734, %r734, 1;
- setp.lt.s32 %p49, %r734, %r12;
+ add.s32 %r728, %r728, 1;
+ setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
@@ -886,68 +884,68 @@
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
- mov.u32 %r418, %tid.z;
- mad.lo.s32 %r46, %r418, %r4, %r9;
- mad.lo.s32 %r47, %r46, %r3, %r5;
- mul.wide.u32 %rd77, %r47, 4;
+ mov.u32 %r417, %tid.z;
+ mad.lo.s32 %r45, %r417, %r4, %r9;
+ mad.lo.s32 %r46, %r45, %r3, %r5;
+ mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
- clz.b32 %r419, %r4;
- mov.u32 %r420, 31;
- sub.s32 %r48, %r420, %r419;
- mov.u32 %r421, 1;
- shl.b32 %r766, %r421, %r48;
- setp.lt.u32 %p50, %r9, %r766;
- add.s32 %r422, %r766, %r9;
- setp.lt.u32 %p51, %r422, %r4;
+ clz.b32 %r418, %r4;
+ mov.u32 %r419, 31;
+ sub.s32 %r47, %r419, %r418;
+ mov.u32 %r420, 1;
+ shl.b32 %r760, %r420, %r47;
+ setp.lt.u32 %p50, %r9, %r760;
+ add.s32 %r421, %r760, %r9;
+ setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
- shl.b32 %r423, %r3, %r48;
- add.s32 %r424, %r47, %r423;
- mul.wide.s32 %rd79, %r424, 4;
+ shl.b32 %r422, %r3, %r47;
+ add.s32 %r423, %r46, %r422;
+ mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
- setp.lt.s32 %p53, %r766, 4;
+ setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
- mov.u32 %r736, %r766;
+ mov.u32 %r730, %r760;
$L__BB0_40:
- shr.u32 %r51, %r736, 1;
- setp.ge.u32 %p54, %r9, %r51;
+ shr.u32 %r50, %r730, 1;
+ setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
- mad.lo.s32 %r425, %r51, %r3, %r47;
- mul.wide.s32 %rd82, %r425, 4;
+ mad.lo.s32 %r424, %r50, %r3, %r46;
+ mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
- setp.gt.u32 %p55, %r736, 7;
- mov.u32 %r736, %r51;
+ setp.gt.u32 %p55, %r730, 7;
+ mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
- mov.u32 %r737, 0;
- add.s32 %r427, %r47, %r3;
- mul.wide.u32 %rd85, %r427, 4;
+ mov.u32 %r731, 0;
+ add.s32 %r426, %r46, %r3;
+ mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
@@ -956,54 +954,54 @@
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
- mov.b32 %r737, %f743;
+ mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
- shl.b32 %r428, %r3, %r48;
- add.s32 %r429, %r47, %r428;
- mul.wide.s32 %rd87, %r429, 4;
+ shl.b32 %r427, %r3, %r47;
+ add.s32 %r428, %r46, %r427;
+ mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
- mov.u32 %r738, %r766;
+ mov.u32 %r732, %r760;
$L__BB0_51:
- shr.u32 %r55, %r738, 1;
- setp.ge.u32 %p60, %r9, %r55;
+ shr.u32 %r54, %r732, 1;
+ setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
- mad.lo.s32 %r430, %r55, %r3, %r47;
- mul.wide.s32 %rd90, %r430, 4;
+ mad.lo.s32 %r429, %r54, %r3, %r46;
+ mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
- setp.gt.u32 %p61, %r738, 7;
- mov.u32 %r738, %r55;
+ setp.gt.u32 %p61, %r732, 7;
+ mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
- mov.u32 %r739, 0;
+ mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1011,54 +1009,54 @@
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
- mov.b32 %r739, %f744;
+ mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
- shl.b32 %r432, %r3, %r48;
- add.s32 %r433, %r47, %r432;
- mul.wide.s32 %rd93, %r433, 4;
+ shl.b32 %r431, %r3, %r47;
+ add.s32 %r432, %r46, %r431;
+ mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
- mov.u32 %r740, %r766;
+ mov.u32 %r734, %r760;
$L__BB0_62:
- shr.u32 %r59, %r740, 1;
- setp.ge.u32 %p66, %r9, %r59;
+ shr.u32 %r58, %r734, 1;
+ setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
- mad.lo.s32 %r434, %r59, %r3, %r47;
- mul.wide.s32 %rd96, %r434, 4;
+ mad.lo.s32 %r433, %r58, %r3, %r46;
+ mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
- setp.gt.u32 %p67, %r740, 7;
- mov.u32 %r740, %r59;
+ setp.gt.u32 %p67, %r734, 7;
+ mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
- mov.u32 %r741, 0;
+ mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1066,54 +1064,54 @@
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
- mov.b32 %r741, %f745;
+ mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
- shl.b32 %r436, %r3, %r48;
- add.s32 %r437, %r47, %r436;
- mul.wide.s32 %rd99, %r437, 4;
+ shl.b32 %r435, %r3, %r47;
+ add.s32 %r436, %r46, %r435;
+ mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
- mov.u32 %r742, %r766;
+ mov.u32 %r736, %r760;
$L__BB0_73:
- shr.u32 %r63, %r742, 1;
- setp.ge.u32 %p72, %r9, %r63;
+ shr.u32 %r62, %r736, 1;
+ setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
- mad.lo.s32 %r438, %r63, %r3, %r47;
- mul.wide.s32 %rd102, %r438, 4;
+ mad.lo.s32 %r437, %r62, %r3, %r46;
+ mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
- setp.gt.u32 %p73, %r742, 7;
- mov.u32 %r742, %r63;
+ setp.gt.u32 %p73, %r736, 7;
+ mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
- mov.u32 %r743, 0;
+ mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1121,54 +1119,54 @@
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
- mov.b32 %r743, %f746;
+ mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
- shl.b32 %r440, %r3, %r48;
- add.s32 %r441, %r47, %r440;
- mul.wide.s32 %rd105, %r441, 4;
+ shl.b32 %r439, %r3, %r47;
+ add.s32 %r440, %r46, %r439;
+ mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
- mov.u32 %r744, %r766;
+ mov.u32 %r738, %r760;
$L__BB0_84:
- shr.u32 %r67, %r744, 1;
- setp.ge.u32 %p78, %r9, %r67;
+ shr.u32 %r66, %r738, 1;
+ setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
- mad.lo.s32 %r442, %r67, %r3, %r47;
- mul.wide.s32 %rd108, %r442, 4;
+ mad.lo.s32 %r441, %r66, %r3, %r46;
+ mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
- setp.gt.u32 %p79, %r744, 7;
- mov.u32 %r744, %r67;
+ setp.gt.u32 %p79, %r738, 7;
+ mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
- mov.u32 %r745, 0;
+ mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1176,54 +1174,54 @@
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
- mov.b32 %r745, %f747;
+ mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
- shl.b32 %r444, %r3, %r48;
- add.s32 %r445, %r47, %r444;
- mul.wide.s32 %rd111, %r445, 4;
+ shl.b32 %r443, %r3, %r47;
+ add.s32 %r444, %r46, %r443;
+ mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
- mov.u32 %r746, %r766;
+ mov.u32 %r740, %r760;
$L__BB0_95:
- shr.u32 %r71, %r746, 1;
- setp.ge.u32 %p84, %r9, %r71;
+ shr.u32 %r70, %r740, 1;
+ setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
- mad.lo.s32 %r446, %r71, %r3, %r47;
- mul.wide.s32 %rd114, %r446, 4;
+ mad.lo.s32 %r445, %r70, %r3, %r46;
+ mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
- setp.gt.u32 %p85, %r746, 7;
- mov.u32 %r746, %r71;
+ setp.gt.u32 %p85, %r740, 7;
+ mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
- mov.u32 %r747, 0;
+ mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1231,54 +1229,54 @@
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
- mov.b32 %r747, %f748;
+ mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
- shl.b32 %r448, %r3, %r48;
- add.s32 %r449, %r47, %r448;
- mul.wide.s32 %rd117, %r449, 4;
+ shl.b32 %r447, %r3, %r47;
+ add.s32 %r448, %r46, %r447;
+ mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
- mov.u32 %r748, %r766;
+ mov.u32 %r742, %r760;
$L__BB0_106:
- shr.u32 %r75, %r748, 1;
- setp.ge.u32 %p90, %r9, %r75;
+ shr.u32 %r74, %r742, 1;
+ setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
- mad.lo.s32 %r450, %r75, %r3, %r47;
- mul.wide.s32 %rd120, %r450, 4;
+ mad.lo.s32 %r449, %r74, %r3, %r46;
+ mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
- setp.gt.u32 %p91, %r748, 7;
- mov.u32 %r748, %r75;
+ setp.gt.u32 %p91, %r742, 7;
+ mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
- mov.u32 %r749, 0;
+ mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1286,54 +1284,54 @@
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
- mov.b32 %r749, %f749;
+ mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
- shl.b32 %r452, %r3, %r48;
- add.s32 %r453, %r47, %r452;
- mul.wide.s32 %rd123, %r453, 4;
+ shl.b32 %r451, %r3, %r47;
+ add.s32 %r452, %r46, %r451;
+ mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
- mov.u32 %r750, %r766;
+ mov.u32 %r744, %r760;
$L__BB0_117:
- shr.u32 %r79, %r750, 1;
- setp.ge.u32 %p96, %r9, %r79;
+ shr.u32 %r78, %r744, 1;
+ setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
- mad.lo.s32 %r454, %r79, %r3, %r47;
- mul.wide.s32 %rd126, %r454, 4;
+ mad.lo.s32 %r453, %r78, %r3, %r46;
+ mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
- setp.gt.u32 %p97, %r750, 7;
- mov.u32 %r750, %r79;
+ setp.gt.u32 %p97, %r744, 7;
+ mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
- mov.u32 %r751, 0;
+ mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1341,55 +1339,55 @@
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
- mov.b32 %r751, %f750;
+ mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
- shl.b32 %r82, %r735, 4;
+ shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
- shl.b32 %r456, %r3, %r48;
- add.s32 %r457, %r47, %r456;
- mul.wide.s32 %rd129, %r457, 4;
+ shl.b32 %r455, %r3, %r47;
+ add.s32 %r456, %r46, %r455;
+ mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
- mov.u32 %r752, %r766;
+ mov.u32 %r746, %r760;
$L__BB0_128:
- shr.u32 %r84, %r752, 1;
- setp.ge.u32 %p102, %r9, %r84;
+ shr.u32 %r83, %r746, 1;
+ setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
- mad.lo.s32 %r458, %r84, %r3, %r47;
- mul.wide.s32 %rd132, %r458, 4;
+ mad.lo.s32 %r457, %r83, %r3, %r46;
+ mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
- setp.gt.u32 %p103, %r752, 7;
- mov.u32 %r752, %r84;
+ setp.gt.u32 %p103, %r746, 7;
+ mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
- mov.u32 %r753, 0;
+ mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1397,54 +1395,54 @@
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
- mov.b32 %r753, %f751;
+ mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
- shl.b32 %r460, %r3, %r48;
- add.s32 %r461, %r47, %r460;
- mul.wide.s32 %rd135, %r461, 4;
+ shl.b32 %r459, %r3, %r47;
+ add.s32 %r460, %r46, %r459;
+ mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
- mov.u32 %r754, %r766;
+ mov.u32 %r748, %r760;
$L__BB0_139:
- shr.u32 %r88, %r754, 1;
- setp.ge.u32 %p108, %r9, %r88;
+ shr.u32 %r87, %r748, 1;
+ setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
- mad.lo.s32 %r462, %r88, %r3, %r47;
- mul.wide.s32 %rd138, %r462, 4;
+ mad.lo.s32 %r461, %r87, %r3, %r46;
+ mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
- setp.gt.u32 %p109, %r754, 7;
- mov.u32 %r754, %r88;
+ setp.gt.u32 %p109, %r748, 7;
+ mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
- mov.u32 %r755, 0;
+ mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1452,54 +1450,54 @@
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
- mov.b32 %r755, %f752;
+ mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
- shl.b32 %r464, %r3, %r48;
- add.s32 %r465, %r47, %r464;
- mul.wide.s32 %rd141, %r465, 4;
+ shl.b32 %r463, %r3, %r47;
+ add.s32 %r464, %r46, %r463;
+ mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
- mov.u32 %r756, %r766;
+ mov.u32 %r750, %r760;
$L__BB0_150:
- shr.u32 %r92, %r756, 1;
- setp.ge.u32 %p114, %r9, %r92;
+ shr.u32 %r91, %r750, 1;
+ setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
- mad.lo.s32 %r466, %r92, %r3, %r47;
- mul.wide.s32 %rd144, %r466, 4;
+ mad.lo.s32 %r465, %r91, %r3, %r46;
+ mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
- setp.gt.u32 %p115, %r756, 7;
- mov.u32 %r756, %r92;
+ setp.gt.u32 %p115, %r750, 7;
+ mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
- mov.u32 %r757, 0;
+ mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1507,54 +1505,54 @@
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
- mov.b32 %r757, %f753;
+ mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
- shl.b32 %r468, %r3, %r48;
- add.s32 %r469, %r47, %r468;
- mul.wide.s32 %rd147, %r469, 4;
+ shl.b32 %r467, %r3, %r47;
+ add.s32 %r468, %r46, %r467;
+ mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
- mov.u32 %r758, %r766;
+ mov.u32 %r752, %r760;
$L__BB0_161:
- shr.u32 %r96, %r758, 1;
- setp.ge.u32 %p120, %r9, %r96;
+ shr.u32 %r95, %r752, 1;
+ setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
- mad.lo.s32 %r470, %r96, %r3, %r47;
- mul.wide.s32 %rd150, %r470, 4;
+ mad.lo.s32 %r469, %r95, %r3, %r46;
+ mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
- setp.gt.u32 %p121, %r758, 7;
- mov.u32 %r758, %r96;
+ setp.gt.u32 %p121, %r752, 7;
+ mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
- mov.u32 %r759, 0;
+ mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1562,54 +1560,54 @@
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
- mov.b32 %r759, %f754;
+ mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
- shl.b32 %r472, %r3, %r48;
- add.s32 %r473, %r47, %r472;
- mul.wide.s32 %rd153, %r473, 4;
+ shl.b32 %r471, %r3, %r47;
+ add.s32 %r472, %r46, %r471;
+ mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
- mov.u32 %r760, %r766;
+ mov.u32 %r754, %r760;
$L__BB0_172:
- shr.u32 %r100, %r760, 1;
- setp.ge.u32 %p126, %r9, %r100;
+ shr.u32 %r99, %r754, 1;
+ setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
- mad.lo.s32 %r474, %r100, %r3, %r47;
- mul.wide.s32 %rd156, %r474, 4;
+ mad.lo.s32 %r473, %r99, %r3, %r46;
+ mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
- setp.gt.u32 %p127, %r760, 7;
- mov.u32 %r760, %r100;
+ setp.gt.u32 %p127, %r754, 7;
+ mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
- mov.u32 %r761, 0;
+ mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1617,54 +1615,54 @@
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
- mov.b32 %r761, %f755;
+ mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
- shl.b32 %r476, %r3, %r48;
- add.s32 %r477, %r47, %r476;
- mul.wide.s32 %rd159, %r477, 4;
+ shl.b32 %r475, %r3, %r47;
+ add.s32 %r476, %r46, %r475;
+ mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
- mov.u32 %r762, %r766;
+ mov.u32 %r756, %r760;
$L__BB0_183:
- shr.u32 %r104, %r762, 1;
- setp.ge.u32 %p132, %r9, %r104;
+ shr.u32 %r103, %r756, 1;
+ setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
- mad.lo.s32 %r478, %r104, %r3, %r47;
- mul.wide.s32 %rd162, %r478, 4;
+ mad.lo.s32 %r477, %r103, %r3, %r46;
+ mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
- setp.gt.u32 %p133, %r762, 7;
- mov.u32 %r762, %r104;
+ setp.gt.u32 %p133, %r756, 7;
+ mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
- mov.u32 %r763, 0;
+ mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1672,54 +1670,54 @@
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
- mov.b32 %r763, %f756;
+ mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
- shl.b32 %r480, %r3, %r48;
- add.s32 %r481, %r47, %r480;
- mul.wide.s32 %rd165, %r481, 4;
+ shl.b32 %r479, %r3, %r47;
+ add.s32 %r480, %r46, %r479;
+ mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
- mov.u32 %r764, %r766;
+ mov.u32 %r758, %r760;
$L__BB0_194:
- shr.u32 %r108, %r764, 1;
- setp.ge.u32 %p138, %r9, %r108;
+ shr.u32 %r107, %r758, 1;
+ setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
- mad.lo.s32 %r482, %r108, %r3, %r47;
- mul.wide.s32 %rd168, %r482, 4;
+ mad.lo.s32 %r481, %r107, %r3, %r46;
+ mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
- setp.gt.u32 %p139, %r764, 7;
- mov.u32 %r764, %r108;
+ setp.gt.u32 %p139, %r758, 7;
+ mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
- mov.u32 %r765, 0;
+ mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@@ -1727,21 +1725,21 @@
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
- mov.b32 %r765, %f757;
+ mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
- shl.b32 %r484, %r3, %r48;
- add.s32 %r485, %r47, %r484;
- mul.wide.s32 %rd171, %r485, 4;
+ shl.b32 %r483, %r3, %r47;
+ add.s32 %r484, %r46, %r483;
+ mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
@@ -1749,30 +1747,30 @@
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
- shr.u32 %r112, %r766, 1;
- setp.ge.u32 %p144, %r9, %r112;
+ shr.u32 %r111, %r760, 1;
+ setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
- mad.lo.s32 %r486, %r112, %r3, %r47;
- mul.wide.s32 %rd174, %r486, 4;
+ mad.lo.s32 %r485, %r111, %r3, %r46;
+ mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
- setp.gt.u32 %p145, %r766, 7;
- mov.u32 %r766, %r112;
+ setp.gt.u32 %p145, %r760, 7;
+ mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
- mov.u32 %r767, 0;
+ mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@@ -1780,420 +1778,416 @@
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
- mov.b32 %r767, %f758;
+ mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
- shl.b32 %r731, %r5, 3;
- mov.u32 %r512, %ctaid.y;
- mad.lo.s32 %r513, %r203, %r512, %r731;
- add.s32 %r514, %r513, %r82;
- mul.wide.s32 %rd183, %r514, 4;
+ mov.u32 %r511, %ctaid.y;
+ mad.lo.s32 %r512, %r202, %r511, %r8;
+ add.s32 %r513, %r512, %r81;
+ mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
- st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
-
- add.s32 %r515, %r514, 4;
- mul.wide.s32 %rd184, %r515, 4;
+ st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
+
+ add.s32 %r514, %r513, 4;
+ mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
- st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
bra.uni $L__BB0_218;
$L__BB0_212:
- shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
- add.s32 %r488, %r728, 3;
- sub.s32 %r115, %r488, %r203;
- mov.u32 %r489, %ctaid.y;
- mad.lo.s32 %r116, %r203, %r489, %r728;
- neg.s32 %r490, %r82;
- setp.ge.s32 %p151, %r115, %r490;
+ add.s32 %r487, %r8, 3;
+ sub.s32 %r114, %r487, %r202;
+ mov.u32 %r488, %ctaid.y;
+ mad.lo.s32 %r115, %r202, %r488, %r8;
+ neg.s32 %r489, %r81;
+ setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
- add.s32 %r495, %r116, %r82;
- mul.wide.s32 %rd178, %r495, 4;
+ add.s32 %r494, %r115, %r81;
+ mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
- st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
+ st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
$L__BB0_214:
- mov.u32 %r496, -4;
- sub.s32 %r497, %r496, %r82;
- setp.ge.s32 %p153, %r115, %r497;
+ mov.u32 %r495, -4;
+ sub.s32 %r496, %r495, %r81;
+ setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
- add.s32 %r502, %r116, %r82;
- add.s32 %r503, %r502, 4;
- mul.wide.s32 %rd180, %r503, 4;
+ add.s32 %r501, %r115, %r81;
+ add.s32 %r502, %r501, 4;
+ mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
- st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
- shl.b32 %r730, %r5, 3;
- shl.b32 %r540, %r735, 5;
- mov.u32 %r541, %ctaid.y;
- mad.lo.s32 %r542, %r203, %r541, %r730;
- add.s32 %r543, %r542, %r540;
- mul.wide.s32 %rd191, %r543, 4;
+ shl.b32 %r539, %r729, 5;
+ mov.u32 %r540, %ctaid.y;
+ mad.lo.s32 %r541, %r202, %r540, %r8;
+ add.s32 %r542, %r541, %r539;
+ mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
-
- add.s32 %r544, %r543, 4;
- mul.wide.s32 %rd192, %r544, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
+
+ add.s32 %r543, %r542, 4;
+ mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
bra.uni $L__BB0_227;
$L__BB0_219:
- shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
- add.s32 %r516, %r729, 3;
- sub.s32 %r117, %r516, %r203;
- mov.u32 %r517, %ctaid.y;
- mad.lo.s32 %r118, %r203, %r517, %r729;
+ add.s32 %r515, %r8, 3;
+ sub.s32 %r116, %r515, %r202;
+ mov.u32 %r516, %ctaid.y;
+ mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
- shl.b32 %r119, %r735, 5;
- neg.s32 %r518, %r119;
- setp.ge.s32 %p160, %r117, %r518;
+ shl.b32 %r118, %r729, 5;
+ neg.s32 %r517, %r118;
+ setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
- add.s32 %r523, %r118, %r119;
- mul.wide.s32 %rd186, %r523, 4;
+ add.s32 %r522, %r117, %r118;
+ mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
+ st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
$L__BB0_222:
@%p159 bra $L__BB0_227;
- shl.b32 %r120, %r735, 5;
- mov.u32 %r524, -4;
- sub.s32 %r525, %r524, %r120;
- setp.ge.s32 %p162, %r117, %r525;
+ shl.b32 %r119, %r729, 5;
+ mov.u32 %r523, -4;
+ sub.s32 %r524, %r523, %r119;
+ setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
- add.s32 %r530, %r118, %r120;
- add.s32 %r531, %r530, 4;
- mul.wide.s32 %rd188, %r531, 4;
+ add.s32 %r529, %r117, %r119;
+ add.s32 %r530, %r529, 4;
+ mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
$L__BB0_227:
- mov.u32 %r121, %ctaid.y;
+ mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r545, %r5, %r9;
- or.b32 %r547, %r545, %r418;
- setp.ne.s32 %p164, %r547, 0;
+ or.b32 %r544, %r5, %r9;
+ or.b32 %r546, %r544, %r417;
+ setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
- mov.u32 %r548, %ctaid.x;
- mov.u32 %r549, %ctaid.z;
- mov.u32 %r550, %nctaid.x;
- mad.lo.s32 %r551, %r549, %r550, %r548;
- mul.wide.s32 %rd194, %r551, 8;
+ mov.u32 %r547, %ctaid.x;
+ mov.u32 %r548, %ctaid.z;
+ mov.u32 %r549, %nctaid.x;
+ mad.lo.s32 %r550, %r548, %r549, %r547;
+ mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
- add.s32 %r552, %r11, -1;
- setp.eq.s32 %p165, %r121, %r552;
+ add.s32 %r551, %r11, -1;
+ setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
- mov.u32 %r768, 8;
+ mov.u32 %r762, 8;
$L__BB0_230:
- nanosleep.u32 %r768;
-
- setp.lt.u32 %p167, %r768, 256;
- selp.u32 %r555, 1, 0, %p167;
- shl.b32 %r768, %r768, %r555;
+ nanosleep.u32 %r762;
+
+ setp.lt.u32 %p167, %r762, 256;
+ selp.u32 %r554, 1, 0, %p167;
+ shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- add.s32 %r557, %r203, 1;
- shr.u32 %r558, %r557, 31;
- add.s32 %r559, %r557, %r558;
- shr.s32 %r560, %r559, 1;
- add.s32 %r561, %r4, %r560;
- add.s32 %r562, %r561, -1;
- div.s32 %r563, %r562, %r4;
- add.s32 %r564, %r11, -1;
- add.s32 %r565, %r564, %r563;
- div.s32 %r124, %r565, %r11;
- add.s32 %r125, %r564, %r3;
- shl.b32 %r126, %r9, 1;
- shl.b32 %r566, %r4, 1;
- mad.lo.s32 %r129, %r566, %r121, %r126;
- or.b32 %r127, %r129, 1;
- mul.lo.s32 %r128, %r566, %r11;
- shr.u32 %r130, %r3, 5;
- mul.lo.s32 %r567, %r46, %r130;
- shr.u32 %r131, %r5, 5;
- add.s32 %r568, %r567, %r131;
- mul.wide.u32 %rd203, %r568, 4;
+ add.s32 %r556, %r202, 1;
+ shr.u32 %r557, %r556, 31;
+ add.s32 %r558, %r556, %r557;
+ shr.s32 %r559, %r558, 1;
+ add.s32 %r560, %r4, %r559;
+ add.s32 %r561, %r560, -1;
+ div.s32 %r562, %r561, %r4;
+ add.s32 %r563, %r11, -1;
+ add.s32 %r564, %r563, %r562;
+ div.s32 %r123, %r564, %r11;
+ add.s32 %r124, %r563, %r3;
+ shl.b32 %r125, %r9, 1;
+ shl.b32 %r565, %r4, 1;
+ mad.lo.s32 %r128, %r565, %r120, %r125;
+ or.b32 %r126, %r128, 1;
+ mul.lo.s32 %r127, %r565, %r11;
+ shr.u32 %r129, %r3, 5;
+ mul.lo.s32 %r566, %r45, %r129;
+ shr.u32 %r130, %r5, 5;
+ add.s32 %r567, %r566, %r130;
+ mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
- and.b32 %r132, %r5, 31;
- add.s32 %r569, %r567, %r132;
- mul.wide.u32 %rd205, %r569, 4;
+ and.b32 %r131, %r5, 31;
+ add.s32 %r568, %r566, %r131;
+ mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
- mov.u32 %r769, 0;
+ mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
- add.s32 %r769, %r769, 1;
+ add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
- setp.lt.s32 %p169, %r769, %r124;
+ setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
- div.s32 %r154, %r125, %r3;
- setp.lt.s32 %p206, %r154, 1;
+ div.s32 %r153, %r124, %r3;
+ setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
- mul.lo.s32 %r650, %r128, %r769;
- add.s32 %r155, %r127, %r650;
- add.s32 %r156, %r129, %r650;
- mov.u32 %r649, 0;
+ mul.lo.s32 %r649, %r127, %r763;
+ add.s32 %r154, %r126, %r649;
+ add.s32 %r155, %r128, %r649;
+ mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
- mov.u32 %r776, %r649;
+ mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
- setp.ge.s32 %p207, %r155, %r203;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ setp.ge.s32 %p207, %r154, %r202;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
- mad.lo.s32 %r158, %r776, %r3, %r5;
- setp.ge.s32 %p208, %r158, %r11;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ mad.lo.s32 %r157, %r770, %r3, %r5;
+ setp.ge.s32 %p208, %r157, %r11;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
- mad.lo.s32 %r657, %r158, %r203, %r156;
- mul.wide.s32 %rd211, %r657, 4;
+ mad.lo.s32 %r656, %r157, %r202, %r155;
+ mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
- ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
+ ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
$L__BB0_263:
- mov.b32 %f642, %r778;
+ mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
- mov.b32 %f643, %r777;
+ mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
- add.s32 %r776, %r776, 1;
- setp.lt.s32 %p209, %r776, %r154;
+ add.s32 %r770, %r770, 1;
+ setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
- mov.b32 %r658, %f770;
- mov.u32 %r659, 31;
- mov.u32 %r660, 16;
- mov.u32 %r661, -1;
- shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
- mov.b32 %f644, %r662;
+ mov.b32 %r657, %f770;
+ mov.u32 %r658, 31;
+ mov.u32 %r659, 16;
+ mov.u32 %r660, -1;
+ shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
+ mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
- mov.b32 %r663, %f645;
- mov.u32 %r664, 8;
- shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
- mov.b32 %f646, %r665;
+ mov.b32 %r662, %f645;
+ mov.u32 %r663, 8;
+ shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
+ mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
- mov.b32 %r666, %f647;
- mov.u32 %r667, 4;
- shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
- mov.b32 %f648, %r668;
+ mov.b32 %r665, %f647;
+ mov.u32 %r666, 4;
+ shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
+ mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
- mov.b32 %r669, %f649;
- mov.u32 %r670, 2;
- shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
- mov.b32 %f650, %r671;
+ mov.b32 %r668, %f649;
+ mov.u32 %r669, 2;
+ shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
+ mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
- mov.b32 %r672, %f651;
- mov.u32 %r673, 1;
- shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
- mov.b32 %f652, %r674;
+ mov.b32 %r671, %f651;
+ mov.u32 %r672, 1;
+ shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
+ mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
- setp.ne.s32 %p215, %r132, 0;
+ setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
- setp.ne.s32 %p216, %r131, 0;
+ setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
- setp.ge.u32 %p217, %r132, %r130;
+ setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
- mov.b32 %r675, %f771;
- mov.u32 %r676, 31;
- mov.u32 %r677, 16;
- mov.u32 %r678, -1;
- shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
- mov.b32 %f654, %r679;
+ mov.b32 %r674, %f771;
+ mov.u32 %r675, 31;
+ mov.u32 %r676, 16;
+ mov.u32 %r677, -1;
+ shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
+ mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
- mov.b32 %r680, %f655;
- mov.u32 %r681, 8;
- shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
- mov.b32 %f656, %r682;
+ mov.b32 %r679, %f655;
+ mov.u32 %r680, 8;
+ shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
+ mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
- mov.b32 %r683, %f657;
- mov.u32 %r684, 4;
- shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
- mov.b32 %f658, %r685;
+ mov.b32 %r682, %f657;
+ mov.u32 %r683, 4;
+ shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
+ mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
- mov.b32 %r686, %f659;
- mov.u32 %r687, 2;
- shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
- mov.b32 %f660, %r688;
+ mov.b32 %r685, %f659;
+ mov.u32 %r686, 2;
+ shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
+ mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
- mov.b32 %r689, %f661;
- mov.u32 %r690, 1;
- shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
- mov.b32 %f662, %r691;
+ mov.b32 %r688, %f661;
+ mov.u32 %r689, 1;
+ shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
+ mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
- setp.eq.s32 %p224, %r132, 0;
+ setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
{ cvt.rn.f16.f32 %rs131, %f663;}
- mov.b32 %r692, %f769;
- mov.u32 %r693, 31;
- mov.u32 %r694, 16;
- mov.u32 %r695, -1;
- shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
- mov.b32 %f665, %r696;
+ mov.b32 %r691, %f769;
+ mov.u32 %r692, 31;
+ mov.u32 %r693, 16;
+ mov.u32 %r694, -1;
+ shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
+ mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
- mov.b32 %r697, %f666;
- mov.u32 %r698, 8;
- shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
- mov.b32 %f667, %r699;
+ mov.b32 %r696, %f666;
+ mov.u32 %r697, 8;
+ shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
+ mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
- mov.b32 %r700, %f668;
- mov.u32 %r701, 4;
- shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
- mov.b32 %f669, %r702;
+ mov.b32 %r699, %f668;
+ mov.u32 %r700, 4;
+ shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
+ mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
- mov.b32 %r703, %f670;
- mov.u32 %r704, 2;
- shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
- mov.b32 %f671, %r705;
+ mov.b32 %r702, %f670;
+ mov.u32 %r703, 2;
+ shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
+ mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
- mov.b32 %r706, %f672;
- mov.u32 %r707, 1;
- shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
- mov.b32 %f673, %r708;
+ mov.b32 %r705, %f672;
+ mov.u32 %r706, 1;
+ shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
+ mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
- setp.ge.u32 %p231, %r132, %r130;
+ setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
- mov.b32 %r709, %f773;
- mov.u32 %r710, 31;
- mov.u32 %r711, 16;
- mov.u32 %r712, -1;
- shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
- mov.b32 %f675, %r713;
+ mov.b32 %r708, %f773;
+ mov.u32 %r709, 31;
+ mov.u32 %r710, 16;
+ mov.u32 %r711, -1;
+ shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
+ mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
- mov.b32 %r714, %f676;
- mov.u32 %r715, 8;
- shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
- mov.b32 %f677, %r716;
+ mov.b32 %r713, %f676;
+ mov.u32 %r714, 8;
+ shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
+ mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
- mov.b32 %r717, %f678;
- mov.u32 %r718, 4;
- shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
- mov.b32 %f679, %r719;
+ mov.b32 %r716, %f678;
+ mov.u32 %r717, 4;
+ shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
+ mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
- mov.b32 %r720, %f680;
- mov.u32 %r721, 2;
- shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
- mov.b32 %f681, %r722;
+ mov.b32 %r719, %f680;
+ mov.u32 %r720, 2;
+ shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
+ mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
- mov.b32 %r723, %f682;
- mov.u32 %r724, 1;
- shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
- mov.b32 %f683, %r725;
+ mov.b32 %r722, %f682;
+ mov.u32 %r723, 1;
+ shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
+ mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
@@ -2202,228 +2196,228 @@
{ cvt.rn.f16.f32 %rs132, %f684;}
@%p6 bra $L__BB0_279;
- mul.lo.s32 %r164, %r128, %r769;
- add.s32 %r726, %r127, %r164;
- setp.ge.s32 %p239, %r726, %r203;
+ mul.lo.s32 %r163, %r127, %r763;
+ add.s32 %r725, %r126, %r163;
+ setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
- add.s32 %r727, %r129, %r164;
- mul.wide.s32 %rd212, %r727, 2;
+ add.s32 %r726, %r128, %r163;
+ mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
- setp.lt.s32 %p170, %r124, 1;
+ setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
- div.s32 %r134, %r125, %r3;
- mad.lo.s32 %r135, %r203, %r5, %r126;
- shl.b32 %r136, %r121, 1;
- shl.b32 %r137, %r11, 1;
- mul.lo.s32 %r138, %r203, %r3;
- mov.u32 %r770, 0;
+ div.s32 %r133, %r124, %r3;
+ mad.lo.s32 %r134, %r202, %r5, %r125;
+ shl.b32 %r135, %r120, 1;
+ shl.b32 %r136, %r11, 1;
+ mul.lo.s32 %r137, %r202, %r3;
+ mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
- setp.lt.s32 %p171, %r134, 1;
+ setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
- mad.lo.s32 %r140, %r128, %r770, %r127;
- mad.lo.s32 %r572, %r137, %r770, %r136;
- mad.lo.s32 %r772, %r4, %r572, %r135;
- mov.u32 %r571, 0;
+ mad.lo.s32 %r139, %r127, %r764, %r126;
+ mad.lo.s32 %r571, %r136, %r764, %r135;
+ mad.lo.s32 %r766, %r4, %r571, %r134;
+ mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
- mov.u32 %r771, %r5;
- mov.u32 %r773, %r571;
+ mov.u32 %r765, %r5;
+ mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
- setp.ge.s32 %p172, %r140, %r203;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p172, %r139, %r202;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
- setp.ge.s32 %p173, %r771, %r11;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p173, %r765, %r11;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
- mul.wide.s32 %rd207, %r772, 4;
+ mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
- ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
+ ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
$L__BB0_240:
- mov.b32 %f594, %r775;
+ mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
- mov.b32 %f595, %r774;
+ mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
- add.s32 %r772, %r772, %r138;
- add.s32 %r771, %r771, %r3;
- add.s32 %r773, %r773, 1;
- setp.lt.s32 %p174, %r773, %r134;
+ add.s32 %r766, %r766, %r137;
+ add.s32 %r765, %r765, %r3;
+ add.s32 %r767, %r767, 1;
+ setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
- mov.b32 %r579, %f762;
- mov.u32 %r580, 31;
- mov.u32 %r581, 16;
- mov.u32 %r582, -1;
- shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
- mov.b32 %f596, %r583;
+ mov.b32 %r578, %f762;
+ mov.u32 %r579, 31;
+ mov.u32 %r580, 16;
+ mov.u32 %r581, -1;
+ shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
+ mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
- mov.b32 %r584, %f597;
- mov.u32 %r585, 8;
- shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
- mov.b32 %f598, %r586;
+ mov.b32 %r583, %f597;
+ mov.u32 %r584, 8;
+ shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
+ mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
- mov.b32 %r587, %f599;
- mov.u32 %r588, 4;
- shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
- mov.b32 %f600, %r589;
+ mov.b32 %r586, %f599;
+ mov.u32 %r587, 4;
+ shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
+ mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
- mov.b32 %r590, %f601;
- mov.u32 %r591, 2;
- shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
- mov.b32 %f602, %r592;
+ mov.b32 %r589, %f601;
+ mov.u32 %r590, 2;
+ shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
+ mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
- mov.b32 %r593, %f603;
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
- mov.b32 %f604, %r595;
+ mov.b32 %r592, %f603;
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
+ mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
- setp.ne.s32 %p180, %r132, 0;
+ setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
- setp.ne.s32 %p181, %r131, 0;
+ setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
- setp.ge.u32 %p182, %r132, %r130;
+ setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
- mov.b32 %r596, %f763;
- mov.u32 %r597, 31;
- mov.u32 %r598, 16;
- mov.u32 %r599, -1;
- shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
- mov.b32 %f606, %r600;
+ mov.b32 %r595, %f763;
+ mov.u32 %r596, 31;
+ mov.u32 %r597, 16;
+ mov.u32 %r598, -1;
+ shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
+ mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
- mov.b32 %r601, %f607;
- mov.u32 %r602, 8;
- shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
- mov.b32 %f608, %r603;
+ mov.b32 %r600, %f607;
+ mov.u32 %r601, 8;
+ shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
+ mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
- mov.b32 %r604, %f609;
- mov.u32 %r605, 4;
- shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
- mov.b32 %f610, %r606;
+ mov.b32 %r603, %f609;
+ mov.u32 %r604, 4;
+ shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
+ mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
- mov.b32 %r607, %f611;
- mov.u32 %r608, 2;
- shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
- mov.b32 %f612, %r609;
+ mov.b32 %r606, %f611;
+ mov.u32 %r607, 2;
+ shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
+ mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
- mov.b32 %r610, %f613;
- mov.u32 %r611, 1;
- shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
- mov.b32 %f614, %r612;
+ mov.b32 %r609, %f613;
+ mov.u32 %r610, 1;
+ shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
+ mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
- setp.eq.s32 %p189, %r132, 0;
+ setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
{ cvt.rn.f16.f32 %rs129, %f615;}
- mov.b32 %r613, %f761;
- mov.u32 %r614, 31;
- mov.u32 %r615, 16;
- mov.u32 %r616, -1;
- shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
- mov.b32 %f617, %r617;
+ mov.b32 %r612, %f761;
+ mov.u32 %r613, 31;
+ mov.u32 %r614, 16;
+ mov.u32 %r615, -1;
+ shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
+ mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
- mov.b32 %r618, %f618;
- mov.u32 %r619, 8;
- shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
- mov.b32 %f619, %r620;
+ mov.b32 %r617, %f618;
+ mov.u32 %r618, 8;
+ shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
+ mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
- mov.b32 %r621, %f620;
- mov.u32 %r622, 4;
- shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
- mov.b32 %f621, %r623;
+ mov.b32 %r620, %f620;
+ mov.u32 %r621, 4;
+ shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
+ mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
- mov.b32 %r624, %f622;
- mov.u32 %r625, 2;
- shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
- mov.b32 %f623, %r626;
+ mov.b32 %r623, %f622;
+ mov.u32 %r624, 2;
+ shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
+ mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
- mov.b32 %r627, %f624;
- mov.u32 %r628, 1;
- shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
- mov.b32 %f625, %r629;
+ mov.b32 %r626, %f624;
+ mov.u32 %r627, 1;
+ shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
+ mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
- setp.ge.u32 %p196, %r132, %r130;
+ setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
- mov.b32 %r630, %f765;
- mov.u32 %r631, 31;
- mov.u32 %r632, 16;
- mov.u32 %r633, -1;
- shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
- mov.b32 %f627, %r634;
+ mov.b32 %r629, %f765;
+ mov.u32 %r630, 31;
+ mov.u32 %r631, 16;
+ mov.u32 %r632, -1;
+ shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
+ mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
- mov.b32 %r635, %f628;
- mov.u32 %r636, 8;
- shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
- mov.b32 %f629, %r637;
+ mov.b32 %r634, %f628;
+ mov.u32 %r635, 8;
+ shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
+ mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
- mov.b32 %r638, %f630;
- mov.u32 %r639, 4;
- shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
- mov.b32 %f631, %r640;
+ mov.b32 %r637, %f630;
+ mov.u32 %r638, 4;
+ shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
+ mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
- mov.b32 %r641, %f632;
- mov.u32 %r642, 2;
- shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
- mov.b32 %f633, %r643;
+ mov.b32 %r640, %f632;
+ mov.u32 %r641, 2;
+ shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
+ mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
- mov.b32 %r644, %f634;
- mov.u32 %r645, 1;
- shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
- mov.b32 %f635, %r646;
+ mov.b32 %r643, %f634;
+ mov.u32 %r644, 1;
+ shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
+ mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
@@ -2432,23 +2426,23 @@
{ cvt.rn.f16.f32 %rs130, %f636;}
@%p6 bra $L__BB0_256;
- mul.lo.s32 %r152, %r128, %r770;
- add.s32 %r647, %r127, %r152;
- setp.ge.s32 %p204, %r647, %r203;
+ mul.lo.s32 %r151, %r127, %r764;
+ add.s32 %r646, %r126, %r151;
+ setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
- add.s32 %r648, %r129, %r152;
- mul.wide.s32 %rd208, %r648, 2;
+ add.s32 %r647, %r128, %r151;
+ mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
- add.s32 %r770, %r770, 1;
- setp.lt.s32 %p205, %r770, %r124;
+ add.s32 %r764, %r764, 1;
+ setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
17: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_1024
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 60→ 56
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__half, 2, 2> T0, Tensor<__half, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__half, 1, 1> T4, Tensor<__half, 1, 1> T5, Tensor<__half, 2, 2> T28, Tensor<__half, 1, 1> T30, Tensor<__half, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__half* T44 = reinterpret_cast<__half*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__half* T41 = reinterpret_cast<__half*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__half* T40 = reinterpret_cast<__half*>(array + smem_offset + 0);
Tensor<__half, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __half2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __half2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __half2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __half2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __half2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2half(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2half(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__half, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2half(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__half, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T53;
T53.set(__half(0));
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T51;
T51.set(__half(0));
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T49;
T49.set(__half(0));
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __half2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__half, 8, 8> T53;
T53.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T51;
T51.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T49;
T49.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__half, 8, 8> T52;
T52.set(__half(0));
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__half, 8, 8> T50;
T50.set(__half(0));
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__half, 8, 8> T48;
T48.set(__half(0));
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__half, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__half, 8, 8> T52;
T52.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__half, 8, 8> T50;
T50.set(__half(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__half, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__half, 8, 8> T48;
T48.set(__half(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__half, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<779>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r202, %r203}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r216, %r217}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r238, %r203, 7;
shr.s32 %r239, %r238, 31;
shr.u32 %r240, %r239, 29;
add.s32 %r241, %r238, %r240;
shr.s32 %r2, %r241, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r242, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r243, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r244, %r4, %r2;
shl.b32 %r245, %r244, 4;
or.b32 %r246, %r245, 15;
and.b32 %r7, %r246, -16;
add.s32 %r247, %r246, %r7;
and.b32 %r248, %r247, -16;
cvt.s64.s32 %rd1, %r248;
max.s32 %r249, %r2, %r3;
add.s32 %r250, %r249, 31;
shr.s32 %r251, %r250, 31;
shr.u32 %r252, %r251, 27;
add.s32 %r253, %r250, %r252;
shr.u32 %r254, %r253, 5;
mul.lo.s32 %r255, %r4, %r254;
shl.b32 %r256, %r255, 7;
cvt.u64.u32 %rd2, %r256;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r257, %r8, 7;
setp.lt.s32 %p7, %r257, %r203;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
// end inline asm
shl.b32 %r261, %r5, 4;
add.s32 %r259, %r258, %r261;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r260, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r260, 0;
cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r735, %r6, 4;
add.s32 %r262, %r4, 215;
div.s32 %r263, %r262, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r264, %r11, %r263;
add.s32 %r265, %r264, -1;
div.s32 %r12, %r265, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r203;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r267, %ctaid.y;
mul.lo.s32 %r268, %r12, %r4;
mul.lo.s32 %r13, %r268, %r267;
shl.b32 %r269, %r9, 1;
shl.b32 %r270, %r5, 4;
mad.lo.s32 %r14, %r269, %r203, %r270;
mul.lo.s32 %r271, %r203, %r9;
cvt.s64.s32 %rd53, %r271;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r272, %r13, %r203;
cvt.s64.s32 %rd6, %r272;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
add.s32 %r15, %r271, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r275, %r274, %r16;
shr.u32 %r17, %r5, 5;
add.s32 %r276, %r275, %r17;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r18, %r5, 31;
add.s32 %r277, %r275, %r18;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r734, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r14, %r280;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r14, %r283;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r734, %r4;
add.s32 %r278, %r23, %r9;
add.s32 %r24, %r278, %r13;
setp.gt.s32 %p13, %r24, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r24, %r212;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r24, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r23, %r203;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r24, %r216;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ cvt.f32.f16 %f234, %rs36;}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ cvt.f32.f16 %f235, %rs37;}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ cvt.f32.f16 %f236, %rs38;}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f237, %rs39;}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ cvt.f32.f16 %f238, %rs40;}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ cvt.f32.f16 %f239, %rs41;}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ cvt.f32.f16 %f240, %rs42;}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ cvt.f32.f16 %f241, %rs43;}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ cvt.f32.f16 %f242, %rs44;}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ cvt.f32.f16 %f243, %rs45;}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ cvt.f32.f16 %f244, %rs46;}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ cvt.f32.f16 %f245, %rs47;}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ cvt.f32.f16 %f246, %rs48;}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ cvt.f32.f16 %f247, %rs49;}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ cvt.f32.f16 %f248, %rs50;}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ cvt.f32.f16 %f249, %rs51;}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ cvt.f32.f16 %f250, %rs52;}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ cvt.f32.f16 %f251, %rs53;}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ cvt.f32.f16 %f252, %rs54;}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ cvt.f32.f16 %f253, %rs55;}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ cvt.f32.f16 %f254, %rs56;}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ cvt.f32.f16 %f255, %rs57;}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ cvt.f32.f16 %f256, %rs58;}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ cvt.f32.f16 %f257, %rs59;}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r735, %r735, 2;
bar.sync 0;
setp.ne.s32 %p23, %r18, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r17, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r18, %r16;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r18, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r18, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r17, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r18, %r16;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r18, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ cvt.f32.f16 %f374, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ cvt.f32.f16 %f375, %rs98;}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ cvt.f32.f16 %f376, %rs99;}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ cvt.f32.f16 %f378, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f379, %rs102;}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ cvt.f32.f16 %f380, %rs103;}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ cvt.f32.f16 %f382, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ cvt.f32.f16 %f383, %rs106;}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ cvt.f32.f16 %f384, %rs107;}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ cvt.f32.f16 %f386, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f387, %rs110;}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ cvt.f32.f16 %f388, %rs111;}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ cvt.f32.f16 %f390, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ cvt.f32.f16 %f391, %rs114;}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ cvt.f32.f16 %f392, %rs115;}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ cvt.f32.f16 %f394, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f395, %rs118;}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ cvt.f32.f16 %f396, %rs119;}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ cvt.f32.f16 %f398, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ cvt.f32.f16 %f399, %rs122;}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ cvt.f32.f16 %f400, %rs123;}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ cvt.f32.f16 %f402, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f403, %rs126;}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ cvt.f32.f16 %f404, %rs127;}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
add.s32 %r416, %r13, %r732;
mad.lo.s32 %r417, %r416, %r203, %r15;
mul.wide.s32 %rd76, %r417, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r734, %r734, 1;
setp.lt.s32 %p49, %r734, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r418, %tid.z;
mad.lo.s32 %r46, %r418, %r4, %r9;
mad.lo.s32 %r47, %r46, %r3, %r5;
mul.wide.u32 %rd77, %r47, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r419, %r4;
mov.u32 %r420, 31;
sub.s32 %r48, %r420, %r419;
mov.u32 %r421, 1;
shl.b32 %r766, %r421, %r48;
setp.lt.u32 %p50, %r9, %r766;
add.s32 %r422, %r766, %r9;
setp.lt.u32 %p51, %r422, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r423, %r3, %r48;
add.s32 %r424, %r47, %r423;
mul.wide.s32 %rd79, %r424, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r766, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r736, %r766;
$L__BB0_40:
shr.u32 %r51, %r736, 1;
setp.ge.u32 %p54, %r9, %r51;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r425, %r51, %r3, %r47;
mul.wide.s32 %rd82, %r425, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r736, 7;
mov.u32 %r736, %r51;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r737, 0;
add.s32 %r427, %r47, %r3;
mul.wide.u32 %rd85, %r427, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r737, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r428, %r3, %r48;
add.s32 %r429, %r47, %r428;
mul.wide.s32 %rd87, %r429, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r738, %r766;
$L__BB0_51:
shr.u32 %r55, %r738, 1;
setp.ge.u32 %p60, %r9, %r55;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r430, %r55, %r3, %r47;
mul.wide.s32 %rd90, %r430, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r738, 7;
mov.u32 %r738, %r55;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r739, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r432, %r3, %r48;
add.s32 %r433, %r47, %r432;
mul.wide.s32 %rd93, %r433, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r740, %r766;
$L__BB0_62:
shr.u32 %r59, %r740, 1;
setp.ge.u32 %p66, %r9, %r59;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r434, %r59, %r3, %r47;
mul.wide.s32 %rd96, %r434, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r740, 7;
mov.u32 %r740, %r59;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r741, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r436, %r3, %r48;
add.s32 %r437, %r47, %r436;
mul.wide.s32 %rd99, %r437, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r742, %r766;
$L__BB0_73:
shr.u32 %r63, %r742, 1;
setp.ge.u32 %p72, %r9, %r63;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r438, %r63, %r3, %r47;
mul.wide.s32 %rd102, %r438, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r742, 7;
mov.u32 %r742, %r63;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r743, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r440, %r3, %r48;
add.s32 %r441, %r47, %r440;
mul.wide.s32 %rd105, %r441, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r744, %r766;
$L__BB0_84:
shr.u32 %r67, %r744, 1;
setp.ge.u32 %p78, %r9, %r67;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r442, %r67, %r3, %r47;
mul.wide.s32 %rd108, %r442, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r744, 7;
mov.u32 %r744, %r67;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r745, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r444, %r3, %r48;
add.s32 %r445, %r47, %r444;
mul.wide.s32 %rd111, %r445, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r746, %r766;
$L__BB0_95:
shr.u32 %r71, %r746, 1;
setp.ge.u32 %p84, %r9, %r71;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r446, %r71, %r3, %r47;
mul.wide.s32 %rd114, %r446, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r746, 7;
mov.u32 %r746, %r71;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r747, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r448, %r3, %r48;
add.s32 %r449, %r47, %r448;
mul.wide.s32 %rd117, %r449, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r748, %r766;
$L__BB0_106:
shr.u32 %r75, %r748, 1;
setp.ge.u32 %p90, %r9, %r75;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r450, %r75, %r3, %r47;
mul.wide.s32 %rd120, %r450, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r748, 7;
mov.u32 %r748, %r75;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r749, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r452, %r3, %r48;
add.s32 %r453, %r47, %r452;
mul.wide.s32 %rd123, %r453, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r750, %r766;
$L__BB0_117:
shr.u32 %r79, %r750, 1;
setp.ge.u32 %p96, %r9, %r79;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r454, %r79, %r3, %r47;
mul.wide.s32 %rd126, %r454, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r750, 7;
mov.u32 %r750, %r79;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r751, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r82, %r735, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r456, %r3, %r48;
add.s32 %r457, %r47, %r456;
mul.wide.s32 %rd129, %r457, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r752, %r766;
$L__BB0_128:
shr.u32 %r84, %r752, 1;
setp.ge.u32 %p102, %r9, %r84;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r458, %r84, %r3, %r47;
mul.wide.s32 %rd132, %r458, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r752, 7;
mov.u32 %r752, %r84;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r753, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r460, %r3, %r48;
add.s32 %r461, %r47, %r460;
mul.wide.s32 %rd135, %r461, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r754, %r766;
$L__BB0_139:
shr.u32 %r88, %r754, 1;
setp.ge.u32 %p108, %r9, %r88;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r462, %r88, %r3, %r47;
mul.wide.s32 %rd138, %r462, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r754, 7;
mov.u32 %r754, %r88;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r755, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r464, %r3, %r48;
add.s32 %r465, %r47, %r464;
mul.wide.s32 %rd141, %r465, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r756, %r766;
$L__BB0_150:
shr.u32 %r92, %r756, 1;
setp.ge.u32 %p114, %r9, %r92;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r466, %r92, %r3, %r47;
mul.wide.s32 %rd144, %r466, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r756, 7;
mov.u32 %r756, %r92;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r757, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r468, %r3, %r48;
add.s32 %r469, %r47, %r468;
mul.wide.s32 %rd147, %r469, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r758, %r766;
$L__BB0_161:
shr.u32 %r96, %r758, 1;
setp.ge.u32 %p120, %r9, %r96;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r470, %r96, %r3, %r47;
mul.wide.s32 %rd150, %r470, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r758, 7;
mov.u32 %r758, %r96;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r759, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r472, %r3, %r48;
add.s32 %r473, %r47, %r472;
mul.wide.s32 %rd153, %r473, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r760, %r766;
$L__BB0_172:
shr.u32 %r100, %r760, 1;
setp.ge.u32 %p126, %r9, %r100;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r474, %r100, %r3, %r47;
mul.wide.s32 %rd156, %r474, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r760, 7;
mov.u32 %r760, %r100;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r761, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r476, %r3, %r48;
add.s32 %r477, %r47, %r476;
mul.wide.s32 %rd159, %r477, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r762, %r766;
$L__BB0_183:
shr.u32 %r104, %r762, 1;
setp.ge.u32 %p132, %r9, %r104;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r478, %r104, %r3, %r47;
mul.wide.s32 %rd162, %r478, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r762, 7;
mov.u32 %r762, %r104;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r763, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r763, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r480, %r3, %r48;
add.s32 %r481, %r47, %r480;
mul.wide.s32 %rd165, %r481, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r764, %r766;
$L__BB0_194:
shr.u32 %r108, %r764, 1;
setp.ge.u32 %p138, %r9, %r108;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r482, %r108, %r3, %r47;
mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r764, 7;
mov.u32 %r764, %r108;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r765, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r765, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r484, %r3, %r48;
add.s32 %r485, %r47, %r484;
mul.wide.s32 %rd171, %r485, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r112, %r766, 1;
setp.ge.u32 %p144, %r9, %r112;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r486, %r112, %r3, %r47;
mul.wide.s32 %rd174, %r486, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r766, 7;
mov.u32 %r766, %r112;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r767, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r767, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
shl.b32 %r731, %r5, 3;
mov.u32 %r512, %ctaid.y;
mad.lo.s32 %r513, %r203, %r512, %r731;
add.s32 %r514, %r513, %r82;
mul.wide.s32 %rd183, %r514, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
// end inline asm
add.s32 %r515, %r514, 4;
mul.wide.s32 %rd184, %r515, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r488, %r728, 3;
sub.s32 %r115, %r488, %r203;
mov.u32 %r489, %ctaid.y;
mad.lo.s32 %r116, %r203, %r489, %r728;
neg.s32 %r490, %r82;
setp.ge.s32 %p151, %r115, %r490;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r495, %r116, %r82;
mul.wide.s32 %rd178, %r495, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
// end inline asm
$L__BB0_214:
mov.u32 %r496, -4;
sub.s32 %r497, %r496, %r82;
setp.ge.s32 %p153, %r115, %r497;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r502, %r116, %r82;
add.s32 %r503, %r502, 4;
mul.wide.s32 %rd180, %r503, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r730, %r5, 3;
shl.b32 %r540, %r735, 5;
mov.u32 %r541, %ctaid.y;
mad.lo.s32 %r542, %r203, %r541, %r730;
add.s32 %r543, %r542, %r540;
mul.wide.s32 %rd191, %r543, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
// end inline asm
add.s32 %r544, %r543, 4;
mul.wide.s32 %rd192, %r544, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r516, %r729, 3;
sub.s32 %r117, %r516, %r203;
mov.u32 %r517, %ctaid.y;
mad.lo.s32 %r118, %r203, %r517, %r729;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r119, %r735, 5;
neg.s32 %r518, %r119;
setp.ge.s32 %p160, %r117, %r518;
@%p160 bra $L__BB0_222;
add.s32 %r523, %r118, %r119;
mul.wide.s32 %rd186, %r523, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r120, %r735, 5;
mov.u32 %r524, -4;
sub.s32 %r525, %r524, %r120;
setp.ge.s32 %p162, %r117, %r525;
@%p162 bra $L__BB0_227;
add.s32 %r530, %r118, %r120;
add.s32 %r531, %r530, 4;
mul.wide.s32 %rd188, %r531, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
// end inline asm
$L__BB0_227:
mov.u32 %r121, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r545, %r5, %r9;
or.b32 %r547, %r545, %r418;
setp.ne.s32 %p164, %r547, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r548, %ctaid.x;
mov.u32 %r549, %ctaid.z;
mov.u32 %r550, %nctaid.x;
mad.lo.s32 %r551, %r549, %r550, %r548;
mul.wide.s32 %rd194, %r551, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r552, %r11, -1;
setp.eq.s32 %p165, %r121, %r552;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r768, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r768;
// end inline asm
setp.lt.u32 %p167, %r768, 256;
selp.u32 %r555, 1, 0, %p167;
shl.b32 %r768, %r768, %r555;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_dabd7834_1033910nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r557, %r203, 1;
shr.u32 %r558, %r557, 31;
add.s32 %r559, %r557, %r558;
shr.s32 %r560, %r559, 1;
add.s32 %r561, %r4, %r560;
add.s32 %r562, %r561, -1;
div.s32 %r563, %r562, %r4;
add.s32 %r564, %r11, -1;
add.s32 %r565, %r564, %r563;
div.s32 %r124, %r565, %r11;
add.s32 %r125, %r564, %r3;
shl.b32 %r126, %r9, 1;
shl.b32 %r566, %r4, 1;
mad.lo.s32 %r129, %r566, %r121, %r126;
or.b32 %r127, %r129, 1;
mul.lo.s32 %r128, %r566, %r11;
shr.u32 %r130, %r3, 5;
mul.lo.s32 %r567, %r46, %r130;
shr.u32 %r131, %r5, 5;
add.s32 %r568, %r567, %r131;
mul.wide.u32 %rd203, %r568, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r132, %r5, 31;
add.s32 %r569, %r567, %r132;
mul.wide.u32 %rd205, %r569, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r769, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r769, %r769, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r769, %r124;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r154, %r125, %r3;
setp.lt.s32 %p206, %r154, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r650, %r128, %r769;
add.s32 %r155, %r127, %r650;
add.s32 %r156, %r129, %r650;
mov.u32 %r649, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r776, %r649;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r155, %r203;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r158, %r776, %r3, %r5;
setp.ge.s32 %p208, %r158, %r11;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r657, %r158, %r203, %r156;
mul.wide.s32 %rd211, %r657, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r778;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r777;
add.f32 %f769, %f769, %f643;
add.s32 %r776, %r776, 1;
setp.lt.s32 %p209, %r776, %r154;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r658, %f770;
mov.u32 %r659, 31;
mov.u32 %r660, 16;
mov.u32 %r661, -1;
shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
mov.b32 %f644, %r662;
add.f32 %f645, %f770, %f644;
mov.b32 %r663, %f645;
mov.u32 %r664, 8;
shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
mov.b32 %f646, %r665;
add.f32 %f647, %f645, %f646;
mov.b32 %r666, %f647;
mov.u32 %r667, 4;
shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
mov.b32 %f648, %r668;
add.f32 %f649, %f647, %f648;
mov.b32 %r669, %f649;
mov.u32 %r670, 2;
shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
mov.b32 %f650, %r671;
add.f32 %f651, %f649, %f650;
mov.b32 %r672, %f651;
mov.u32 %r673, 1;
shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
mov.b32 %f652, %r674;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r132, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r131, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r132, %r130;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r675, %f771;
mov.u32 %r676, 31;
mov.u32 %r677, 16;
mov.u32 %r678, -1;
shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
mov.b32 %f654, %r679;
add.f32 %f655, %f771, %f654;
mov.b32 %r680, %f655;
mov.u32 %r681, 8;
shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
mov.b32 %f656, %r682;
add.f32 %f657, %f655, %f656;
mov.b32 %r683, %f657;
mov.u32 %r684, 4;
shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
mov.b32 %f658, %r685;
add.f32 %f659, %f657, %f658;
mov.b32 %r686, %f659;
mov.u32 %r687, 2;
shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
mov.b32 %f660, %r688;
add.f32 %f661, %f659, %f660;
mov.b32 %r689, %f661;
mov.u32 %r690, 1;
shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
mov.b32 %f662, %r691;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r132, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r692, %f769;
mov.u32 %r693, 31;
mov.u32 %r694, 16;
mov.u32 %r695, -1;
shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
mov.b32 %f665, %r696;
add.f32 %f666, %f769, %f665;
mov.b32 %r697, %f666;
mov.u32 %r698, 8;
shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
mov.b32 %f667, %r699;
add.f32 %f668, %f666, %f667;
mov.b32 %r700, %f668;
mov.u32 %r701, 4;
shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
mov.b32 %f669, %r702;
add.f32 %f670, %f668, %f669;
mov.b32 %r703, %f670;
mov.u32 %r704, 2;
shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
mov.b32 %f671, %r705;
add.f32 %f672, %f670, %f671;
mov.b32 %r706, %f672;
mov.u32 %r707, 1;
shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
mov.b32 %f673, %r708;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r132, %r130;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r709, %f773;
mov.u32 %r710, 31;
mov.u32 %r711, 16;
mov.u32 %r712, -1;
shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
mov.b32 %f675, %r713;
add.f32 %f676, %f773, %f675;
mov.b32 %r714, %f676;
mov.u32 %r715, 8;
shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
mov.b32 %f677, %r716;
add.f32 %f678, %f676, %f677;
mov.b32 %r717, %f678;
mov.u32 %r718, 4;
shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
mov.b32 %f679, %r719;
add.f32 %f680, %f678, %f679;
mov.b32 %r720, %f680;
mov.u32 %r721, 2;
shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
mov.b32 %f681, %r722;
add.f32 %f682, %f680, %f681;
mov.b32 %r723, %f682;
mov.u32 %r724, 1;
shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
mov.b32 %f683, %r725;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r164, %r128, %r769;
add.s32 %r726, %r127, %r164;
setp.ge.s32 %p239, %r726, %r203;
@%p239 bra $L__BB0_279;
add.s32 %r727, %r129, %r164;
mul.wide.s32 %rd212, %r727, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r124, 1;
@%p170 bra $L__BB0_257;
div.s32 %r134, %r125, %r3;
mad.lo.s32 %r135, %r203, %r5, %r126;
shl.b32 %r136, %r121, 1;
shl.b32 %r137, %r11, 1;
mul.lo.s32 %r138, %r203, %r3;
mov.u32 %r770, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r134, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r140, %r128, %r770, %r127;
mad.lo.s32 %r572, %r137, %r770, %r136;
mad.lo.s32 %r772, %r4, %r572, %r135;
mov.u32 %r571, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r771, %r5;
mov.u32 %r773, %r571;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r140, %r203;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r771, %r11;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r772, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r775;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r774;
add.f32 %f761, %f761, %f595;
add.s32 %r772, %r772, %r138;
add.s32 %r771, %r771, %r3;
add.s32 %r773, %r773, 1;
setp.lt.s32 %p174, %r773, %r134;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r579, %f762;
mov.u32 %r580, 31;
mov.u32 %r581, 16;
mov.u32 %r582, -1;
shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
mov.b32 %f596, %r583;
add.f32 %f597, %f762, %f596;
mov.b32 %r584, %f597;
mov.u32 %r585, 8;
shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
mov.b32 %f598, %r586;
add.f32 %f599, %f597, %f598;
mov.b32 %r587, %f599;
mov.u32 %r588, 4;
shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
mov.b32 %f600, %r589;
add.f32 %f601, %f599, %f600;
mov.b32 %r590, %f601;
mov.u32 %r591, 2;
shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
mov.b32 %f602, %r592;
add.f32 %f603, %f601, %f602;
mov.b32 %r593, %f603;
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
mov.b32 %f604, %r595;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r132, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r131, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r132, %r130;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r596, %f763;
mov.u32 %r597, 31;
mov.u32 %r598, 16;
mov.u32 %r599, -1;
shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
mov.b32 %f606, %r600;
add.f32 %f607, %f763, %f606;
mov.b32 %r601, %f607;
mov.u32 %r602, 8;
shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
mov.b32 %f608, %r603;
add.f32 %f609, %f607, %f608;
mov.b32 %r604, %f609;
mov.u32 %r605, 4;
shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
mov.b32 %f610, %r606;
add.f32 %f611, %f609, %f610;
mov.b32 %r607, %f611;
mov.u32 %r608, 2;
shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
mov.b32 %f612, %r609;
add.f32 %f613, %f611, %f612;
mov.b32 %r610, %f613;
mov.u32 %r611, 1;
shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
mov.b32 %f614, %r612;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r132, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r613, %f761;
mov.u32 %r614, 31;
mov.u32 %r615, 16;
mov.u32 %r616, -1;
shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
mov.b32 %f617, %r617;
add.f32 %f618, %f761, %f617;
mov.b32 %r618, %f618;
mov.u32 %r619, 8;
shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
mov.b32 %f619, %r620;
add.f32 %f620, %f618, %f619;
mov.b32 %r621, %f620;
mov.u32 %r622, 4;
shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
mov.b32 %f621, %r623;
add.f32 %f622, %f620, %f621;
mov.b32 %r624, %f622;
mov.u32 %r625, 2;
shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
mov.b32 %f623, %r626;
add.f32 %f624, %f622, %f623;
mov.b32 %r627, %f624;
mov.u32 %r628, 1;
shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
mov.b32 %f625, %r629;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r132, %r130;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r630, %f765;
mov.u32 %r631, 31;
mov.u32 %r632, 16;
mov.u32 %r633, -1;
shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
mov.b32 %f627, %r634;
add.f32 %f628, %f765, %f627;
mov.b32 %r635, %f628;
mov.u32 %r636, 8;
shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
mov.b32 %f629, %r637;
add.f32 %f630, %f628, %f629;
mov.b32 %r638, %f630;
mov.u32 %r639, 4;
shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
mov.b32 %f631, %r640;
add.f32 %f632, %f630, %f631;
mov.b32 %r641, %f632;
mov.u32 %r642, 2;
shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
mov.b32 %f633, %r643;
add.f32 %f634, %f632, %f633;
mov.b32 %r644, %f634;
mov.u32 %r645, 1;
shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
mov.b32 %f635, %r646;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r152, %r128, %r770;
add.s32 %r647, %r127, %r152;
setp.ge.s32 %p204, %r647, %r203;
@%p204 bra $L__BB0_256;
add.s32 %r648, %r129, %r152;
mul.wide.s32 %rd208, %r648, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r770, %r770, 1;
setp.lt.s32 %p205, %r770, %r124;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
max.s32 %r248, %r2, %r3;
add.s32 %r249, %r248, 31;
shr.s32 %r250, %r249, 31;
shr.u32 %r251, %r250, 27;
add.s32 %r252, %r249, %r251;
shr.u32 %r253, %r252, 5;
mul.lo.s32 %r254, %r4, %r253;
shl.b32 %r255, %r254, 7;
cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r256, %r8, 7;
setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
shl.b32 %r260, %r5, 4;
add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r729, %r6, 4;
add.s32 %r261, %r4, 215;
div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r263, %r11, %r262;
add.s32 %r264, %r263, -1;
div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r266, %ctaid.y;
mul.lo.s32 %r267, %r12, %r4;
mul.lo.s32 %r13, %r267, %r266;
mad.lo.s32 %r268, %r2, %r9, %r5;
shl.b32 %r14, %r268, 4;
mul.lo.s32 %r269, %r202, %r9;
cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r270, %r13, %r202;
cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
shl.b32 %r271, %r9, 3;
mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r275, %r274, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r17, %r5, 31;
add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r280, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r728, %r4;
add.s32 %r278, %r22, %r9;
add.s32 %r23, %r278, %r13;
setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ cvt.f32.f16 %f234, %rs36;}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ cvt.f32.f16 %f235, %rs37;}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ cvt.f32.f16 %f236, %rs38;}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ cvt.f32.f16 %f237, %rs39;}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ cvt.f32.f16 %f238, %rs40;}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ cvt.f32.f16 %f239, %rs41;}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ cvt.f32.f16 %f240, %rs42;}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ cvt.f32.f16 %f241, %rs43;}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ cvt.f32.f16 %f242, %rs44;}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ cvt.f32.f16 %f243, %rs45;}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ cvt.f32.f16 %f244, %rs46;}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ cvt.f32.f16 %f245, %rs47;}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ cvt.f32.f16 %f246, %rs48;}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ cvt.f32.f16 %f247, %rs49;}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ cvt.f32.f16 %f248, %rs50;}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ cvt.f32.f16 %f249, %rs51;}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ cvt.f32.f16 %f250, %rs52;}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ cvt.f32.f16 %f251, %rs53;}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ cvt.f32.f16 %f252, %rs54;}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ cvt.f32.f16 %f253, %rs55;}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ cvt.f32.f16 %f254, %rs56;}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ cvt.f32.f16 %f255, %rs57;}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ cvt.f32.f16 %f256, %rs58;}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ cvt.f32.f16 %f257, %rs59;}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r729, %r729, 2;
bar.sync 0;
setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.f16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ cvt.f32.f16 %f374, %rs97;}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ cvt.f32.f16 %f375, %rs98;}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ cvt.f32.f16 %f376, %rs99;}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ cvt.f32.f16 %f378, %rs101;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f379, %rs102;}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ cvt.f32.f16 %f380, %rs103;}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.f16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ cvt.f32.f16 %f382, %rs105;}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ cvt.f32.f16 %f383, %rs106;}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ cvt.f32.f16 %f384, %rs107;}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ cvt.f32.f16 %f386, %rs109;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f387, %rs110;}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ cvt.f32.f16 %f388, %rs111;}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.f16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ cvt.f32.f16 %f390, %rs113;}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ cvt.f32.f16 %f391, %rs114;}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ cvt.f32.f16 %f392, %rs115;}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ cvt.f32.f16 %f394, %rs117;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f395, %rs118;}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ cvt.f32.f16 %f396, %rs119;}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.f16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ cvt.f32.f16 %f398, %rs121;}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ cvt.f32.f16 %f399, %rs122;}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ cvt.f32.f16 %f400, %rs123;}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ cvt.f32.f16 %f402, %rs125;}
// end inline asm
// begin inline asm
{ cvt.f32.f16 %f403, %rs126;}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ cvt.f32.f16 %f404, %rs127;}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.f16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
mad.lo.s32 %r416, %r23, %r202, %r8;
mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.f16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r728, %r728, 1;
setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r417, %tid.z;
mad.lo.s32 %r45, %r417, %r4, %r9;
mad.lo.s32 %r46, %r45, %r3, %r5;
mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r418, %r4;
mov.u32 %r419, 31;
sub.s32 %r47, %r419, %r418;
mov.u32 %r420, 1;
shl.b32 %r760, %r420, %r47;
setp.lt.u32 %p50, %r9, %r760;
add.s32 %r421, %r760, %r9;
setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r422, %r3, %r47;
add.s32 %r423, %r46, %r422;
mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r730, %r760;
$L__BB0_40:
shr.u32 %r50, %r730, 1;
setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r424, %r50, %r3, %r46;
mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r730, 7;
mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r731, 0;
add.s32 %r426, %r46, %r3;
mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r427, %r3, %r47;
add.s32 %r428, %r46, %r427;
mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r732, %r760;
$L__BB0_51:
shr.u32 %r54, %r732, 1;
setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r429, %r54, %r3, %r46;
mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r732, 7;
mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r431, %r3, %r47;
add.s32 %r432, %r46, %r431;
mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r734, %r760;
$L__BB0_62:
shr.u32 %r58, %r734, 1;
setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r433, %r58, %r3, %r46;
mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r734, 7;
mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r435, %r3, %r47;
add.s32 %r436, %r46, %r435;
mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r736, %r760;
$L__BB0_73:
shr.u32 %r62, %r736, 1;
setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r437, %r62, %r3, %r46;
mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r736, 7;
mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r439, %r3, %r47;
add.s32 %r440, %r46, %r439;
mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r738, %r760;
$L__BB0_84:
shr.u32 %r66, %r738, 1;
setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r441, %r66, %r3, %r46;
mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r738, 7;
mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r443, %r3, %r47;
add.s32 %r444, %r46, %r443;
mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r740, %r760;
$L__BB0_95:
shr.u32 %r70, %r740, 1;
setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r445, %r70, %r3, %r46;
mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r740, 7;
mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r447, %r3, %r47;
add.s32 %r448, %r46, %r447;
mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r742, %r760;
$L__BB0_106:
shr.u32 %r74, %r742, 1;
setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r449, %r74, %r3, %r46;
mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r742, 7;
mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r451, %r3, %r47;
add.s32 %r452, %r46, %r451;
mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r744, %r760;
$L__BB0_117:
shr.u32 %r78, %r744, 1;
setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r453, %r78, %r3, %r46;
mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r744, 7;
mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r455, %r3, %r47;
add.s32 %r456, %r46, %r455;
mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r746, %r760;
$L__BB0_128:
shr.u32 %r83, %r746, 1;
setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r457, %r83, %r3, %r46;
mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r746, 7;
mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r459, %r3, %r47;
add.s32 %r460, %r46, %r459;
mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r748, %r760;
$L__BB0_139:
shr.u32 %r87, %r748, 1;
setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r461, %r87, %r3, %r46;
mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r748, 7;
mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r463, %r3, %r47;
add.s32 %r464, %r46, %r463;
mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r750, %r760;
$L__BB0_150:
shr.u32 %r91, %r750, 1;
setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r465, %r91, %r3, %r46;
mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r750, 7;
mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r467, %r3, %r47;
add.s32 %r468, %r46, %r467;
mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r752, %r760;
$L__BB0_161:
shr.u32 %r95, %r752, 1;
setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r469, %r95, %r3, %r46;
mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r752, 7;
mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r471, %r3, %r47;
add.s32 %r472, %r46, %r471;
mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r754, %r760;
$L__BB0_172:
shr.u32 %r99, %r754, 1;
setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r473, %r99, %r3, %r46;
mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r754, 7;
mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r475, %r3, %r47;
add.s32 %r476, %r46, %r475;
mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r756, %r760;
$L__BB0_183:
shr.u32 %r103, %r756, 1;
setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r477, %r103, %r3, %r46;
mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r756, 7;
mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r479, %r3, %r47;
add.s32 %r480, %r46, %r479;
mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r758, %r760;
$L__BB0_194:
shr.u32 %r107, %r758, 1;
setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r481, %r107, %r3, %r46;
mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r758, 7;
mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r483, %r3, %r47;
add.s32 %r484, %r46, %r483;
mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r111, %r760, 1;
setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r485, %r111, %r3, %r46;
mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r760, 7;
mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
mov.u32 %r511, %ctaid.y;
mad.lo.s32 %r512, %r202, %r511, %r8;
add.s32 %r513, %r512, %r81;
mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
// end inline asm
add.s32 %r514, %r513, 4;
mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r487, %r8, 3;
sub.s32 %r114, %r487, %r202;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r115, %r202, %r488, %r8;
neg.s32 %r489, %r81;
setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r494, %r115, %r81;
mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
// end inline asm
$L__BB0_214:
mov.u32 %r495, -4;
sub.s32 %r496, %r495, %r81;
setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r501, %r115, %r81;
add.s32 %r502, %r501, 4;
mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r539, %r729, 5;
mov.u32 %r540, %ctaid.y;
mad.lo.s32 %r541, %r202, %r540, %r8;
add.s32 %r542, %r541, %r539;
mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
// end inline asm
add.s32 %r543, %r542, 4;
mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r515, %r8, 3;
sub.s32 %r116, %r515, %r202;
mov.u32 %r516, %ctaid.y;
mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r118, %r729, 5;
neg.s32 %r517, %r118;
setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
add.s32 %r522, %r117, %r118;
mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r119, %r729, 5;
mov.u32 %r523, -4;
sub.s32 %r524, %r523, %r119;
setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
add.s32 %r529, %r117, %r119;
add.s32 %r530, %r529, 4;
mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
// end inline asm
$L__BB0_227:
mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r544, %r5, %r9;
or.b32 %r546, %r544, %r417;
setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r547, %ctaid.x;
mov.u32 %r548, %ctaid.z;
mov.u32 %r549, %nctaid.x;
mad.lo.s32 %r550, %r548, %r549, %r547;
mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r551, %r11, -1;
setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r762, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r762;
// end inline asm
setp.lt.u32 %p167, %r762, 256;
selp.u32 %r554, 1, 0, %p167;
shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_32_cu_c94510e2_723310nvfuser_32ENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r556, %r202, 1;
shr.u32 %r557, %r556, 31;
add.s32 %r558, %r556, %r557;
shr.s32 %r559, %r558, 1;
add.s32 %r560, %r4, %r559;
add.s32 %r561, %r560, -1;
div.s32 %r562, %r561, %r4;
add.s32 %r563, %r11, -1;
add.s32 %r564, %r563, %r562;
div.s32 %r123, %r564, %r11;
add.s32 %r124, %r563, %r3;
shl.b32 %r125, %r9, 1;
shl.b32 %r565, %r4, 1;
mad.lo.s32 %r128, %r565, %r120, %r125;
or.b32 %r126, %r128, 1;
mul.lo.s32 %r127, %r565, %r11;
shr.u32 %r129, %r3, 5;
mul.lo.s32 %r566, %r45, %r129;
shr.u32 %r130, %r5, 5;
add.s32 %r567, %r566, %r130;
mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r131, %r5, 31;
add.s32 %r568, %r566, %r131;
mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r153, %r124, %r3;
setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r649, %r127, %r763;
add.s32 %r154, %r126, %r649;
add.s32 %r155, %r128, %r649;
mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r154, %r202;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r157, %r770, %r3, %r5;
setp.ge.s32 %p208, %r157, %r11;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r656, %r157, %r202, %r155;
mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
add.s32 %r770, %r770, 1;
setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r657, %f770;
mov.u32 %r658, 31;
mov.u32 %r659, 16;
mov.u32 %r660, -1;
shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
mov.b32 %r662, %f645;
mov.u32 %r663, 8;
shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
mov.b32 %r665, %f647;
mov.u32 %r666, 4;
shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
mov.b32 %r668, %f649;
mov.u32 %r669, 2;
shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
mov.b32 %r671, %f651;
mov.u32 %r672, 1;
shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r674, %f771;
mov.u32 %r675, 31;
mov.u32 %r676, 16;
mov.u32 %r677, -1;
shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
mov.b32 %r679, %f655;
mov.u32 %r680, 8;
shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
mov.b32 %r682, %f657;
mov.u32 %r683, 4;
shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
mov.b32 %r685, %f659;
mov.u32 %r686, 2;
shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
mov.b32 %r688, %f661;
mov.u32 %r689, 1;
shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r691, %f769;
mov.u32 %r692, 31;
mov.u32 %r693, 16;
mov.u32 %r694, -1;
shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
mov.b32 %r696, %f666;
mov.u32 %r697, 8;
shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
mov.b32 %r699, %f668;
mov.u32 %r700, 4;
shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
mov.b32 %r702, %f670;
mov.u32 %r703, 2;
shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
mov.b32 %r705, %f672;
mov.u32 %r706, 1;
shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r708, %f773;
mov.u32 %r709, 31;
mov.u32 %r710, 16;
mov.u32 %r711, -1;
shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
mov.b32 %r713, %f676;
mov.u32 %r714, 8;
shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
mov.b32 %r716, %f678;
mov.u32 %r717, 4;
shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
mov.b32 %r719, %f680;
mov.u32 %r720, 2;
shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
mov.b32 %r722, %f682;
mov.u32 %r723, 1;
shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r163, %r127, %r763;
add.s32 %r725, %r126, %r163;
setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
add.s32 %r726, %r128, %r163;
mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
div.s32 %r133, %r124, %r3;
mad.lo.s32 %r134, %r202, %r5, %r125;
shl.b32 %r135, %r120, 1;
shl.b32 %r136, %r11, 1;
mul.lo.s32 %r137, %r202, %r3;
mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r139, %r127, %r764, %r126;
mad.lo.s32 %r571, %r136, %r764, %r135;
mad.lo.s32 %r766, %r4, %r571, %r134;
mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r765, %r5;
mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r139, %r202;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r765, %r11;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
add.s32 %r766, %r766, %r137;
add.s32 %r765, %r765, %r3;
add.s32 %r767, %r767, 1;
setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r578, %f762;
mov.u32 %r579, 31;
mov.u32 %r580, 16;
mov.u32 %r581, -1;
shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
mov.b32 %r583, %f597;
mov.u32 %r584, 8;
shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
mov.b32 %r586, %f599;
mov.u32 %r587, 4;
shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
mov.b32 %r589, %f601;
mov.u32 %r590, 2;
shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
mov.b32 %r592, %f603;
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r595, %f763;
mov.u32 %r596, 31;
mov.u32 %r597, 16;
mov.u32 %r598, -1;
shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
mov.b32 %r600, %f607;
mov.u32 %r601, 8;
shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
mov.b32 %r603, %f609;
mov.u32 %r604, 4;
shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
mov.b32 %r606, %f611;
mov.u32 %r607, 2;
shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
mov.b32 %r609, %f613;
mov.u32 %r610, 1;
shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r612, %f761;
mov.u32 %r613, 31;
mov.u32 %r614, 16;
mov.u32 %r615, -1;
shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
mov.b32 %r617, %f618;
mov.u32 %r618, 8;
shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
mov.b32 %r620, %f620;
mov.u32 %r621, 4;
shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
mov.b32 %r623, %f622;
mov.u32 %r624, 2;
shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
mov.b32 %r626, %f624;
mov.u32 %r627, 1;
shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r629, %f765;
mov.u32 %r630, 31;
mov.u32 %r631, 16;
mov.u32 %r632, -1;
shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
mov.b32 %r634, %f628;
mov.u32 %r635, 8;
shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
mov.b32 %r637, %f630;
mov.u32 %r638, 4;
shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
mov.b32 %r640, %f632;
mov.u32 %r641, 2;
shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
mov.b32 %r643, %f634;
mov.u32 %r644, 1;
shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.f16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r151, %r127, %r764;
add.s32 %r646, %r126, %r151;
setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
add.s32 %r647, %r128, %r151;
mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r764, %r764, 1;
setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,166 +32,166 @@
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
- .reg .b32 %r<779>;
+ .reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r202, %r203}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r216, %r217}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r238, %r203, 7;
- shr.s32 %r239, %r238, 31;
- shr.u32 %r240, %r239, 29;
- add.s32 %r241, %r238, %r240;
- shr.s32 %r2, %r241, 3;
+ add.s32 %r237, %r202, 7;
+ shr.s32 %r238, %r237, 31;
+ shr.u32 %r239, %r238, 29;
+ add.s32 %r240, %r237, %r239;
+ shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
- mov.u32 %r242, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
+ mov.u32 %r241, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r243, [%rd43], %r5;
+ atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r244, %r4, %r2;
- shl.b32 %r245, %r244, 4;
- or.b32 %r246, %r245, 15;
- and.b32 %r7, %r246, -16;
- add.s32 %r247, %r246, %r7;
- and.b32 %r248, %r247, -16;
- cvt.s64.s32 %rd1, %r248;
- max.s32 %r249, %r2, %r3;
- add.s32 %r250, %r249, 31;
- shr.s32 %r251, %r250, 31;
- shr.u32 %r252, %r251, 27;
- add.s32 %r253, %r250, %r252;
- shr.u32 %r254, %r253, 5;
- mul.lo.s32 %r255, %r4, %r254;
- shl.b32 %r256, %r255, 7;
- cvt.u64.u32 %rd2, %r256;
+ mul.lo.s32 %r243, %r4, %r2;
+ shl.b32 %r244, %r243, 4;
+ or.b32 %r245, %r244, 15;
+ and.b32 %r7, %r245, -16;
+ add.s32 %r246, %r245, %r7;
+ and.b32 %r247, %r246, -16;
+ cvt.s64.s32 %rd1, %r247;
+ max.s32 %r248, %r2, %r3;
+ add.s32 %r249, %r248, 31;
+ shr.s32 %r250, %r249, 31;
+ shr.u32 %r251, %r250, 27;
+ add.s32 %r252, %r249, %r251;
+ shr.u32 %r253, %r252, 5;
+ mul.lo.s32 %r254, %r4, %r253;
+ shl.b32 %r255, %r254, 7;
+ cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r257, %r8, 7;
- setp.lt.s32 %p7, %r257, %r203;
+ or.b32 %r256, %r8, 7;
+ setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
-
-
- shl.b32 %r261, %r5, 4;
- add.s32 %r259, %r258, %r261;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
+
+
+ shl.b32 %r260, %r5, 4;
+ add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
- mov.u32 %r260, 0;
+ mov.u32 %r259, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r260, 0;
- cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r259, 0;
+ cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r735, %r6, 4;
- add.s32 %r262, %r4, 215;
- div.s32 %r263, %r262, %r4;
+ shl.b32 %r729, %r6, 4;
+ add.s32 %r261, %r4, 215;
+ div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r264, %r11, %r263;
- add.s32 %r265, %r264, -1;
- div.s32 %r12, %r265, %r11;
+ add.s32 %r263, %r11, %r262;
+ add.s32 %r264, %r263, -1;
+ div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r203;
+ cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
- mov.u32 %r267, %ctaid.y;
- mul.lo.s32 %r268, %r12, %r4;
- mul.lo.s32 %r13, %r268, %r267;
- shl.b32 %r269, %r9, 1;
- shl.b32 %r270, %r5, 4;
- mad.lo.s32 %r14, %r269, %r203, %r270;
- mul.lo.s32 %r271, %r203, %r9;
- cvt.s64.s32 %rd53, %r271;
+ mov.u32 %r266, %ctaid.y;
+ mul.lo.s32 %r267, %r12, %r4;
+ mul.lo.s32 %r13, %r267, %r266;
+ mad.lo.s32 %r268, %r2, %r9, %r5;
+ shl.b32 %r14, %r268, 4;
+ mul.lo.s32 %r269, %r202, %r9;
+ cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r272, %r13, %r203;
- cvt.s64.s32 %rd6, %r272;
+ mul.lo.s32 %r270, %r13, %r202;
+ cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- add.s32 %r15, %r271, %r8;
+ shl.b32 %r271, %r9, 3;
+ mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 2;
+ mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r275, %r274, %r16;
- shr.u32 %r17, %r5, 5;
- add.s32 %r276, %r275, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r275, %r274, %r15;
+ shr.u32 %r16, %r5, 5;
+ add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
- and.b32 %r18, %r5, 31;
- add.s32 %r277, %r275, %r18;
+ and.b32 %r17, %r5, 31;
+ add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
- mov.u32 %r734, 0;
+ mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
- add.s32 %r281, %r14, %r280;
+ add.s32 %r281, %r280, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
- add.s32 %r284, %r14, %r283;
+ add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
@@ -207,29 +207,29 @@
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r734, %r4;
- add.s32 %r278, %r23, %r9;
- add.s32 %r24, %r278, %r13;
- setp.gt.s32 %p13, %r24, 215;
+ mul.lo.s32 %r22, %r728, %r4;
+ add.s32 %r278, %r22, %r9;
+ add.s32 %r23, %r278, %r13;
+ setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
- mul.lo.s32 %r279, %r24, %r212;
+ mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p14, %r24, 216;
+ setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
- mul.lo.s32 %r286, %r23, %r203;
+ mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
@@ -255,11 +255,11 @@
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
- mul.lo.s32 %r287, %r24, %r216;
+ mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
@@ -478,23 +478,23 @@
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
- shl.b32 %r735, %r735, 2;
- bar.sync 0;
- setp.ne.s32 %p23, %r18, 0;
+ shl.b32 %r729, %r729, 2;
+ bar.sync 0;
+ setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
- setp.ne.s32 %p24, %r17, 0;
+ setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
- setp.ge.u32 %p25, %r18, %r16;
+ setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
@@ -526,11 +526,11 @@
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
- setp.ne.s32 %p242, %r18, 0;
+ setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
@@ -556,23 +556,23 @@
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
- setp.eq.s32 %p37, %r18, 0;
+ setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
- setp.ne.s32 %p240, %r17, 0;
+ setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
- setp.ge.u32 %p39, %r18, %r16;
+ setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
@@ -615,11 +615,11 @@
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
- setp.eq.s32 %p241, %r18, 0;
+ setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
@@ -633,11 +633,10 @@
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
- mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
@@ -846,13 +845,12 @@
{ cvt.rn.f16.f32 %rs124, %f401;}
mov.b32 %r391, {%rs124, %rs128};
- add.s32 %r416, %r13, %r732;
- mad.lo.s32 %r417, %r416, %r203, %r15;
- mul.wide.s32 %rd76, %r417, 2;
+ mad.lo.s32 %r416, %r23, %r202, %r8;
+ mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
bra.uni $L__BB0_35;
@@ -862,12 +860,12 @@
{ cvt.rn.f16.f32 %rs61, %f337;}
$L__BB0_35:
- add.s32 %r734, %r734, 1;
- setp.lt.s32 %p49, %r734, %r12;
+ add.s32 %r728, %r728, 1;
+ setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
@@ -886,68 +884,68 @@
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
- mov.u32 %r418, %tid.z;
- mad.lo.s32 %r46, %r418, %r4, %r9;
- mad.lo.s32 %r47, %r46, %r3, %r5;
- mul.wide.u32 %rd77, %r47, 4;
+ mov.u32 %r417, %tid.z;
+ mad.lo.s32 %r45, %r417, %r4, %r9;
+ mad.lo.s32 %r46, %r45, %r3, %r5;
+ mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
- clz.b32 %r419, %r4;
- mov.u32 %r420, 31;
- sub.s32 %r48, %r420, %r419;
- mov.u32 %r421, 1;
- shl.b32 %r766, %r421, %r48;
- setp.lt.u32 %p50, %r9, %r766;
- add.s32 %r422, %r766, %r9;
- setp.lt.u32 %p51, %r422, %r4;
+ clz.b32 %r418, %r4;
+ mov.u32 %r419, 31;
+ sub.s32 %r47, %r419, %r418;
+ mov.u32 %r420, 1;
+ shl.b32 %r760, %r420, %r47;
+ setp.lt.u32 %p50, %r9, %r760;
+ add.s32 %r421, %r760, %r9;
+ setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
- shl.b32 %r423, %r3, %r48;
- add.s32 %r424, %r47, %r423;
- mul.wide.s32 %rd79, %r424, 4;
+ shl.b32 %r422, %r3, %r47;
+ add.s32 %r423, %r46, %r422;
+ mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
- setp.lt.s32 %p53, %r766, 4;
+ setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
- mov.u32 %r736, %r766;
+ mov.u32 %r730, %r760;
$L__BB0_40:
- shr.u32 %r51, %r736, 1;
- setp.ge.u32 %p54, %r9, %r51;
+ shr.u32 %r50, %r730, 1;
+ setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
- mad.lo.s32 %r425, %r51, %r3, %r47;
- mul.wide.s32 %rd82, %r425, 4;
+ mad.lo.s32 %r424, %r50, %r3, %r46;
+ mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
- setp.gt.u32 %p55, %r736, 7;
- mov.u32 %r736, %r51;
+ setp.gt.u32 %p55, %r730, 7;
+ mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
- mov.u32 %r737, 0;
- add.s32 %r427, %r47, %r3;
- mul.wide.u32 %rd85, %r427, 4;
+ mov.u32 %r731, 0;
+ add.s32 %r426, %r46, %r3;
+ mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
@@ -956,54 +954,54 @@
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
- mov.b32 %r737, %f743;
+ mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
- shl.b32 %r428, %r3, %r48;
- add.s32 %r429, %r47, %r428;
- mul.wide.s32 %rd87, %r429, 4;
+ shl.b32 %r427, %r3, %r47;
+ add.s32 %r428, %r46, %r427;
+ mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
- mov.u32 %r738, %r766;
+ mov.u32 %r732, %r760;
$L__BB0_51:
- shr.u32 %r55, %r738, 1;
- setp.ge.u32 %p60, %r9, %r55;
+ shr.u32 %r54, %r732, 1;
+ setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
- mad.lo.s32 %r430, %r55, %r3, %r47;
- mul.wide.s32 %rd90, %r430, 4;
+ mad.lo.s32 %r429, %r54, %r3, %r46;
+ mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
- setp.gt.u32 %p61, %r738, 7;
- mov.u32 %r738, %r55;
+ setp.gt.u32 %p61, %r732, 7;
+ mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
- mov.u32 %r739, 0;
+ mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1011,54 +1009,54 @@
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
- mov.b32 %r739, %f744;
+ mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
- shl.b32 %r432, %r3, %r48;
- add.s32 %r433, %r47, %r432;
- mul.wide.s32 %rd93, %r433, 4;
+ shl.b32 %r431, %r3, %r47;
+ add.s32 %r432, %r46, %r431;
+ mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
- mov.u32 %r740, %r766;
+ mov.u32 %r734, %r760;
$L__BB0_62:
- shr.u32 %r59, %r740, 1;
- setp.ge.u32 %p66, %r9, %r59;
+ shr.u32 %r58, %r734, 1;
+ setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
- mad.lo.s32 %r434, %r59, %r3, %r47;
- mul.wide.s32 %rd96, %r434, 4;
+ mad.lo.s32 %r433, %r58, %r3, %r46;
+ mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
- setp.gt.u32 %p67, %r740, 7;
- mov.u32 %r740, %r59;
+ setp.gt.u32 %p67, %r734, 7;
+ mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
- mov.u32 %r741, 0;
+ mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1066,54 +1064,54 @@
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
- mov.b32 %r741, %f745;
+ mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
- shl.b32 %r436, %r3, %r48;
- add.s32 %r437, %r47, %r436;
- mul.wide.s32 %rd99, %r437, 4;
+ shl.b32 %r435, %r3, %r47;
+ add.s32 %r436, %r46, %r435;
+ mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
- mov.u32 %r742, %r766;
+ mov.u32 %r736, %r760;
$L__BB0_73:
- shr.u32 %r63, %r742, 1;
- setp.ge.u32 %p72, %r9, %r63;
+ shr.u32 %r62, %r736, 1;
+ setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
- mad.lo.s32 %r438, %r63, %r3, %r47;
- mul.wide.s32 %rd102, %r438, 4;
+ mad.lo.s32 %r437, %r62, %r3, %r46;
+ mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
- setp.gt.u32 %p73, %r742, 7;
- mov.u32 %r742, %r63;
+ setp.gt.u32 %p73, %r736, 7;
+ mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
- mov.u32 %r743, 0;
+ mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1121,54 +1119,54 @@
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
- mov.b32 %r743, %f746;
+ mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
- shl.b32 %r440, %r3, %r48;
- add.s32 %r441, %r47, %r440;
- mul.wide.s32 %rd105, %r441, 4;
+ shl.b32 %r439, %r3, %r47;
+ add.s32 %r440, %r46, %r439;
+ mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
- mov.u32 %r744, %r766;
+ mov.u32 %r738, %r760;
$L__BB0_84:
- shr.u32 %r67, %r744, 1;
- setp.ge.u32 %p78, %r9, %r67;
+ shr.u32 %r66, %r738, 1;
+ setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
- mad.lo.s32 %r442, %r67, %r3, %r47;
- mul.wide.s32 %rd108, %r442, 4;
+ mad.lo.s32 %r441, %r66, %r3, %r46;
+ mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
- setp.gt.u32 %p79, %r744, 7;
- mov.u32 %r744, %r67;
+ setp.gt.u32 %p79, %r738, 7;
+ mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
- mov.u32 %r745, 0;
+ mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1176,54 +1174,54 @@
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
- mov.b32 %r745, %f747;
+ mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
- shl.b32 %r444, %r3, %r48;
- add.s32 %r445, %r47, %r444;
- mul.wide.s32 %rd111, %r445, 4;
+ shl.b32 %r443, %r3, %r47;
+ add.s32 %r444, %r46, %r443;
+ mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
- mov.u32 %r746, %r766;
+ mov.u32 %r740, %r760;
$L__BB0_95:
- shr.u32 %r71, %r746, 1;
- setp.ge.u32 %p84, %r9, %r71;
+ shr.u32 %r70, %r740, 1;
+ setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
- mad.lo.s32 %r446, %r71, %r3, %r47;
- mul.wide.s32 %rd114, %r446, 4;
+ mad.lo.s32 %r445, %r70, %r3, %r46;
+ mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
- setp.gt.u32 %p85, %r746, 7;
- mov.u32 %r746, %r71;
+ setp.gt.u32 %p85, %r740, 7;
+ mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
- mov.u32 %r747, 0;
+ mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1231,54 +1229,54 @@
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
- mov.b32 %r747, %f748;
+ mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
- shl.b32 %r448, %r3, %r48;
- add.s32 %r449, %r47, %r448;
- mul.wide.s32 %rd117, %r449, 4;
+ shl.b32 %r447, %r3, %r47;
+ add.s32 %r448, %r46, %r447;
+ mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
- mov.u32 %r748, %r766;
+ mov.u32 %r742, %r760;
$L__BB0_106:
- shr.u32 %r75, %r748, 1;
- setp.ge.u32 %p90, %r9, %r75;
+ shr.u32 %r74, %r742, 1;
+ setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
- mad.lo.s32 %r450, %r75, %r3, %r47;
- mul.wide.s32 %rd120, %r450, 4;
+ mad.lo.s32 %r449, %r74, %r3, %r46;
+ mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
- setp.gt.u32 %p91, %r748, 7;
- mov.u32 %r748, %r75;
+ setp.gt.u32 %p91, %r742, 7;
+ mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
- mov.u32 %r749, 0;
+ mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1286,54 +1284,54 @@
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
- mov.b32 %r749, %f749;
+ mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
- shl.b32 %r452, %r3, %r48;
- add.s32 %r453, %r47, %r452;
- mul.wide.s32 %rd123, %r453, 4;
+ shl.b32 %r451, %r3, %r47;
+ add.s32 %r452, %r46, %r451;
+ mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
- mov.u32 %r750, %r766;
+ mov.u32 %r744, %r760;
$L__BB0_117:
- shr.u32 %r79, %r750, 1;
- setp.ge.u32 %p96, %r9, %r79;
+ shr.u32 %r78, %r744, 1;
+ setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
- mad.lo.s32 %r454, %r79, %r3, %r47;
- mul.wide.s32 %rd126, %r454, 4;
+ mad.lo.s32 %r453, %r78, %r3, %r46;
+ mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
- setp.gt.u32 %p97, %r750, 7;
- mov.u32 %r750, %r79;
+ setp.gt.u32 %p97, %r744, 7;
+ mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
- mov.u32 %r751, 0;
+ mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1341,55 +1339,55 @@
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
- mov.b32 %r751, %f750;
+ mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
- shl.b32 %r82, %r735, 4;
+ shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
- shl.b32 %r456, %r3, %r48;
- add.s32 %r457, %r47, %r456;
- mul.wide.s32 %rd129, %r457, 4;
+ shl.b32 %r455, %r3, %r47;
+ add.s32 %r456, %r46, %r455;
+ mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
- mov.u32 %r752, %r766;
+ mov.u32 %r746, %r760;
$L__BB0_128:
- shr.u32 %r84, %r752, 1;
- setp.ge.u32 %p102, %r9, %r84;
+ shr.u32 %r83, %r746, 1;
+ setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
- mad.lo.s32 %r458, %r84, %r3, %r47;
- mul.wide.s32 %rd132, %r458, 4;
+ mad.lo.s32 %r457, %r83, %r3, %r46;
+ mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
- setp.gt.u32 %p103, %r752, 7;
- mov.u32 %r752, %r84;
+ setp.gt.u32 %p103, %r746, 7;
+ mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
- mov.u32 %r753, 0;
+ mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1397,54 +1395,54 @@
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
- mov.b32 %r753, %f751;
+ mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
- shl.b32 %r460, %r3, %r48;
- add.s32 %r461, %r47, %r460;
- mul.wide.s32 %rd135, %r461, 4;
+ shl.b32 %r459, %r3, %r47;
+ add.s32 %r460, %r46, %r459;
+ mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
- mov.u32 %r754, %r766;
+ mov.u32 %r748, %r760;
$L__BB0_139:
- shr.u32 %r88, %r754, 1;
- setp.ge.u32 %p108, %r9, %r88;
+ shr.u32 %r87, %r748, 1;
+ setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
- mad.lo.s32 %r462, %r88, %r3, %r47;
- mul.wide.s32 %rd138, %r462, 4;
+ mad.lo.s32 %r461, %r87, %r3, %r46;
+ mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
- setp.gt.u32 %p109, %r754, 7;
- mov.u32 %r754, %r88;
+ setp.gt.u32 %p109, %r748, 7;
+ mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
- mov.u32 %r755, 0;
+ mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1452,54 +1450,54 @@
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
- mov.b32 %r755, %f752;
+ mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
- shl.b32 %r464, %r3, %r48;
- add.s32 %r465, %r47, %r464;
- mul.wide.s32 %rd141, %r465, 4;
+ shl.b32 %r463, %r3, %r47;
+ add.s32 %r464, %r46, %r463;
+ mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
- mov.u32 %r756, %r766;
+ mov.u32 %r750, %r760;
$L__BB0_150:
- shr.u32 %r92, %r756, 1;
- setp.ge.u32 %p114, %r9, %r92;
+ shr.u32 %r91, %r750, 1;
+ setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
- mad.lo.s32 %r466, %r92, %r3, %r47;
- mul.wide.s32 %rd144, %r466, 4;
+ mad.lo.s32 %r465, %r91, %r3, %r46;
+ mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
- setp.gt.u32 %p115, %r756, 7;
- mov.u32 %r756, %r92;
+ setp.gt.u32 %p115, %r750, 7;
+ mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
- mov.u32 %r757, 0;
+ mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1507,54 +1505,54 @@
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
- mov.b32 %r757, %f753;
+ mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
- shl.b32 %r468, %r3, %r48;
- add.s32 %r469, %r47, %r468;
- mul.wide.s32 %rd147, %r469, 4;
+ shl.b32 %r467, %r3, %r47;
+ add.s32 %r468, %r46, %r467;
+ mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
- mov.u32 %r758, %r766;
+ mov.u32 %r752, %r760;
$L__BB0_161:
- shr.u32 %r96, %r758, 1;
- setp.ge.u32 %p120, %r9, %r96;
+ shr.u32 %r95, %r752, 1;
+ setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
- mad.lo.s32 %r470, %r96, %r3, %r47;
- mul.wide.s32 %rd150, %r470, 4;
+ mad.lo.s32 %r469, %r95, %r3, %r46;
+ mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
- setp.gt.u32 %p121, %r758, 7;
- mov.u32 %r758, %r96;
+ setp.gt.u32 %p121, %r752, 7;
+ mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
- mov.u32 %r759, 0;
+ mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1562,54 +1560,54 @@
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
- mov.b32 %r759, %f754;
+ mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
- shl.b32 %r472, %r3, %r48;
- add.s32 %r473, %r47, %r472;
- mul.wide.s32 %rd153, %r473, 4;
+ shl.b32 %r471, %r3, %r47;
+ add.s32 %r472, %r46, %r471;
+ mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
- mov.u32 %r760, %r766;
+ mov.u32 %r754, %r760;
$L__BB0_172:
- shr.u32 %r100, %r760, 1;
- setp.ge.u32 %p126, %r9, %r100;
+ shr.u32 %r99, %r754, 1;
+ setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
- mad.lo.s32 %r474, %r100, %r3, %r47;
- mul.wide.s32 %rd156, %r474, 4;
+ mad.lo.s32 %r473, %r99, %r3, %r46;
+ mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
- setp.gt.u32 %p127, %r760, 7;
- mov.u32 %r760, %r100;
+ setp.gt.u32 %p127, %r754, 7;
+ mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
- mov.u32 %r761, 0;
+ mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1617,54 +1615,54 @@
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
- mov.b32 %r761, %f755;
+ mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
- shl.b32 %r476, %r3, %r48;
- add.s32 %r477, %r47, %r476;
- mul.wide.s32 %rd159, %r477, 4;
+ shl.b32 %r475, %r3, %r47;
+ add.s32 %r476, %r46, %r475;
+ mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
- mov.u32 %r762, %r766;
+ mov.u32 %r756, %r760;
$L__BB0_183:
- shr.u32 %r104, %r762, 1;
- setp.ge.u32 %p132, %r9, %r104;
+ shr.u32 %r103, %r756, 1;
+ setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
- mad.lo.s32 %r478, %r104, %r3, %r47;
- mul.wide.s32 %rd162, %r478, 4;
+ mad.lo.s32 %r477, %r103, %r3, %r46;
+ mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
- setp.gt.u32 %p133, %r762, 7;
- mov.u32 %r762, %r104;
+ setp.gt.u32 %p133, %r756, 7;
+ mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
- mov.u32 %r763, 0;
+ mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1672,54 +1670,54 @@
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
- mov.b32 %r763, %f756;
+ mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
- shl.b32 %r480, %r3, %r48;
- add.s32 %r481, %r47, %r480;
- mul.wide.s32 %rd165, %r481, 4;
+ shl.b32 %r479, %r3, %r47;
+ add.s32 %r480, %r46, %r479;
+ mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
- mov.u32 %r764, %r766;
+ mov.u32 %r758, %r760;
$L__BB0_194:
- shr.u32 %r108, %r764, 1;
- setp.ge.u32 %p138, %r9, %r108;
+ shr.u32 %r107, %r758, 1;
+ setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
- mad.lo.s32 %r482, %r108, %r3, %r47;
- mul.wide.s32 %rd168, %r482, 4;
+ mad.lo.s32 %r481, %r107, %r3, %r46;
+ mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
- setp.gt.u32 %p139, %r764, 7;
- mov.u32 %r764, %r108;
+ setp.gt.u32 %p139, %r758, 7;
+ mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
- mov.u32 %r765, 0;
+ mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@@ -1727,21 +1725,21 @@
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
- mov.b32 %r765, %f757;
+ mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
- shl.b32 %r484, %r3, %r48;
- add.s32 %r485, %r47, %r484;
- mul.wide.s32 %rd171, %r485, 4;
+ shl.b32 %r483, %r3, %r47;
+ add.s32 %r484, %r46, %r483;
+ mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
@@ -1749,30 +1747,30 @@
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
- shr.u32 %r112, %r766, 1;
- setp.ge.u32 %p144, %r9, %r112;
+ shr.u32 %r111, %r760, 1;
+ setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
- mad.lo.s32 %r486, %r112, %r3, %r47;
- mul.wide.s32 %rd174, %r486, 4;
+ mad.lo.s32 %r485, %r111, %r3, %r46;
+ mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
- setp.gt.u32 %p145, %r766, 7;
- mov.u32 %r766, %r112;
+ setp.gt.u32 %p145, %r760, 7;
+ mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
- mov.u32 %r767, 0;
+ mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@@ -1780,420 +1778,416 @@
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
- mov.b32 %r767, %f758;
+ mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
- shl.b32 %r731, %r5, 3;
- mov.u32 %r512, %ctaid.y;
- mad.lo.s32 %r513, %r203, %r512, %r731;
- add.s32 %r514, %r513, %r82;
- mul.wide.s32 %rd183, %r514, 4;
+ mov.u32 %r511, %ctaid.y;
+ mad.lo.s32 %r512, %r202, %r511, %r8;
+ add.s32 %r513, %r512, %r81;
+ mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
- st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
-
- add.s32 %r515, %r514, 4;
- mul.wide.s32 %rd184, %r515, 4;
+ st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
+
+ add.s32 %r514, %r513, 4;
+ mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
- st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
bra.uni $L__BB0_218;
$L__BB0_212:
- shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
- add.s32 %r488, %r728, 3;
- sub.s32 %r115, %r488, %r203;
- mov.u32 %r489, %ctaid.y;
- mad.lo.s32 %r116, %r203, %r489, %r728;
- neg.s32 %r490, %r82;
- setp.ge.s32 %p151, %r115, %r490;
+ add.s32 %r487, %r8, 3;
+ sub.s32 %r114, %r487, %r202;
+ mov.u32 %r488, %ctaid.y;
+ mad.lo.s32 %r115, %r202, %r488, %r8;
+ neg.s32 %r489, %r81;
+ setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
- add.s32 %r495, %r116, %r82;
- mul.wide.s32 %rd178, %r495, 4;
+ add.s32 %r494, %r115, %r81;
+ mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
- st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
+ st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
$L__BB0_214:
- mov.u32 %r496, -4;
- sub.s32 %r497, %r496, %r82;
- setp.ge.s32 %p153, %r115, %r497;
+ mov.u32 %r495, -4;
+ sub.s32 %r496, %r495, %r81;
+ setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
- add.s32 %r502, %r116, %r82;
- add.s32 %r503, %r502, 4;
- mul.wide.s32 %rd180, %r503, 4;
+ add.s32 %r501, %r115, %r81;
+ add.s32 %r502, %r501, 4;
+ mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
- st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
- shl.b32 %r730, %r5, 3;
- shl.b32 %r540, %r735, 5;
- mov.u32 %r541, %ctaid.y;
- mad.lo.s32 %r542, %r203, %r541, %r730;
- add.s32 %r543, %r542, %r540;
- mul.wide.s32 %rd191, %r543, 4;
+ shl.b32 %r539, %r729, 5;
+ mov.u32 %r540, %ctaid.y;
+ mad.lo.s32 %r541, %r202, %r540, %r8;
+ add.s32 %r542, %r541, %r539;
+ mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
-
- add.s32 %r544, %r543, 4;
- mul.wide.s32 %rd192, %r544, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
+
+ add.s32 %r543, %r542, 4;
+ mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
bra.uni $L__BB0_227;
$L__BB0_219:
- shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
- add.s32 %r516, %r729, 3;
- sub.s32 %r117, %r516, %r203;
- mov.u32 %r517, %ctaid.y;
- mad.lo.s32 %r118, %r203, %r517, %r729;
+ add.s32 %r515, %r8, 3;
+ sub.s32 %r116, %r515, %r202;
+ mov.u32 %r516, %ctaid.y;
+ mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
- shl.b32 %r119, %r735, 5;
- neg.s32 %r518, %r119;
- setp.ge.s32 %p160, %r117, %r518;
+ shl.b32 %r118, %r729, 5;
+ neg.s32 %r517, %r118;
+ setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
- add.s32 %r523, %r118, %r119;
- mul.wide.s32 %rd186, %r523, 4;
+ add.s32 %r522, %r117, %r118;
+ mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
+ st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
$L__BB0_222:
@%p159 bra $L__BB0_227;
- shl.b32 %r120, %r735, 5;
- mov.u32 %r524, -4;
- sub.s32 %r525, %r524, %r120;
- setp.ge.s32 %p162, %r117, %r525;
+ shl.b32 %r119, %r729, 5;
+ mov.u32 %r523, -4;
+ sub.s32 %r524, %r523, %r119;
+ setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
- add.s32 %r530, %r118, %r120;
- add.s32 %r531, %r530, 4;
- mul.wide.s32 %rd188, %r531, 4;
+ add.s32 %r529, %r117, %r119;
+ add.s32 %r530, %r529, 4;
+ mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
$L__BB0_227:
- mov.u32 %r121, %ctaid.y;
+ mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r545, %r5, %r9;
- or.b32 %r547, %r545, %r418;
- setp.ne.s32 %p164, %r547, 0;
+ or.b32 %r544, %r5, %r9;
+ or.b32 %r546, %r544, %r417;
+ setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
- mov.u32 %r548, %ctaid.x;
- mov.u32 %r549, %ctaid.z;
- mov.u32 %r550, %nctaid.x;
- mad.lo.s32 %r551, %r549, %r550, %r548;
- mul.wide.s32 %rd194, %r551, 8;
+ mov.u32 %r547, %ctaid.x;
+ mov.u32 %r548, %ctaid.z;
+ mov.u32 %r549, %nctaid.x;
+ mad.lo.s32 %r550, %r548, %r549, %r547;
+ mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
- add.s32 %r552, %r11, -1;
- setp.eq.s32 %p165, %r121, %r552;
+ add.s32 %r551, %r11, -1;
+ setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
- mov.u32 %r768, 8;
+ mov.u32 %r762, 8;
$L__BB0_230:
- nanosleep.u32 %r768;
-
- setp.lt.u32 %p167, %r768, 256;
- selp.u32 %r555, 1, 0, %p167;
- shl.b32 %r768, %r768, %r555;
+ nanosleep.u32 %r762;
+
+ setp.lt.u32 %p167, %r762, 256;
+ selp.u32 %r554, 1, 0, %p167;
+ shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN11kernelscope6kernelENS_6TensorINS_6__halfELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- add.s32 %r557, %r203, 1;
- shr.u32 %r558, %r557, 31;
- add.s32 %r559, %r557, %r558;
- shr.s32 %r560, %r559, 1;
- add.s32 %r561, %r4, %r560;
- add.s32 %r562, %r561, -1;
- div.s32 %r563, %r562, %r4;
- add.s32 %r564, %r11, -1;
- add.s32 %r565, %r564, %r563;
- div.s32 %r124, %r565, %r11;
- add.s32 %r125, %r564, %r3;
- shl.b32 %r126, %r9, 1;
- shl.b32 %r566, %r4, 1;
- mad.lo.s32 %r129, %r566, %r121, %r126;
- or.b32 %r127, %r129, 1;
- mul.lo.s32 %r128, %r566, %r11;
- shr.u32 %r130, %r3, 5;
- mul.lo.s32 %r567, %r46, %r130;
- shr.u32 %r131, %r5, 5;
- add.s32 %r568, %r567, %r131;
- mul.wide.u32 %rd203, %r568, 4;
+ add.s32 %r556, %r202, 1;
+ shr.u32 %r557, %r556, 31;
+ add.s32 %r558, %r556, %r557;
+ shr.s32 %r559, %r558, 1;
+ add.s32 %r560, %r4, %r559;
+ add.s32 %r561, %r560, -1;
+ div.s32 %r562, %r561, %r4;
+ add.s32 %r563, %r11, -1;
+ add.s32 %r564, %r563, %r562;
+ div.s32 %r123, %r564, %r11;
+ add.s32 %r124, %r563, %r3;
+ shl.b32 %r125, %r9, 1;
+ shl.b32 %r565, %r4, 1;
+ mad.lo.s32 %r128, %r565, %r120, %r125;
+ or.b32 %r126, %r128, 1;
+ mul.lo.s32 %r127, %r565, %r11;
+ shr.u32 %r129, %r3, 5;
+ mul.lo.s32 %r566, %r45, %r129;
+ shr.u32 %r130, %r5, 5;
+ add.s32 %r567, %r566, %r130;
+ mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
- and.b32 %r132, %r5, 31;
- add.s32 %r569, %r567, %r132;
- mul.wide.u32 %rd205, %r569, 4;
+ and.b32 %r131, %r5, 31;
+ add.s32 %r568, %r566, %r131;
+ mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
- mov.u32 %r769, 0;
+ mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
- add.s32 %r769, %r769, 1;
+ add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
- setp.lt.s32 %p169, %r769, %r124;
+ setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
- div.s32 %r154, %r125, %r3;
- setp.lt.s32 %p206, %r154, 1;
+ div.s32 %r153, %r124, %r3;
+ setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
- mul.lo.s32 %r650, %r128, %r769;
- add.s32 %r155, %r127, %r650;
- add.s32 %r156, %r129, %r650;
- mov.u32 %r649, 0;
+ mul.lo.s32 %r649, %r127, %r763;
+ add.s32 %r154, %r126, %r649;
+ add.s32 %r155, %r128, %r649;
+ mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
- mov.u32 %r776, %r649;
+ mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
- setp.ge.s32 %p207, %r155, %r203;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ setp.ge.s32 %p207, %r154, %r202;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
- mad.lo.s32 %r158, %r776, %r3, %r5;
- setp.ge.s32 %p208, %r158, %r11;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ mad.lo.s32 %r157, %r770, %r3, %r5;
+ setp.ge.s32 %p208, %r157, %r11;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
- mad.lo.s32 %r657, %r158, %r203, %r156;
- mul.wide.s32 %rd211, %r657, 4;
+ mad.lo.s32 %r656, %r157, %r202, %r155;
+ mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
- ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
+ ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
$L__BB0_263:
- mov.b32 %f642, %r778;
+ mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
- mov.b32 %f643, %r777;
+ mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
- add.s32 %r776, %r776, 1;
- setp.lt.s32 %p209, %r776, %r154;
+ add.s32 %r770, %r770, 1;
+ setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
- mov.b32 %r658, %f770;
- mov.u32 %r659, 31;
- mov.u32 %r660, 16;
- mov.u32 %r661, -1;
- shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
- mov.b32 %f644, %r662;
+ mov.b32 %r657, %f770;
+ mov.u32 %r658, 31;
+ mov.u32 %r659, 16;
+ mov.u32 %r660, -1;
+ shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
+ mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
- mov.b32 %r663, %f645;
- mov.u32 %r664, 8;
- shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
- mov.b32 %f646, %r665;
+ mov.b32 %r662, %f645;
+ mov.u32 %r663, 8;
+ shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
+ mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
- mov.b32 %r666, %f647;
- mov.u32 %r667, 4;
- shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
- mov.b32 %f648, %r668;
+ mov.b32 %r665, %f647;
+ mov.u32 %r666, 4;
+ shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
+ mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
- mov.b32 %r669, %f649;
- mov.u32 %r670, 2;
- shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
- mov.b32 %f650, %r671;
+ mov.b32 %r668, %f649;
+ mov.u32 %r669, 2;
+ shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
+ mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
- mov.b32 %r672, %f651;
- mov.u32 %r673, 1;
- shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
- mov.b32 %f652, %r674;
+ mov.b32 %r671, %f651;
+ mov.u32 %r672, 1;
+ shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
+ mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
- setp.ne.s32 %p215, %r132, 0;
+ setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
- setp.ne.s32 %p216, %r131, 0;
+ setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
- setp.ge.u32 %p217, %r132, %r130;
+ setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
- mov.b32 %r675, %f771;
- mov.u32 %r676, 31;
- mov.u32 %r677, 16;
- mov.u32 %r678, -1;
- shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
- mov.b32 %f654, %r679;
+ mov.b32 %r674, %f771;
+ mov.u32 %r675, 31;
+ mov.u32 %r676, 16;
+ mov.u32 %r677, -1;
+ shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
+ mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
- mov.b32 %r680, %f655;
- mov.u32 %r681, 8;
- shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
- mov.b32 %f656, %r682;
+ mov.b32 %r679, %f655;
+ mov.u32 %r680, 8;
+ shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
+ mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
- mov.b32 %r683, %f657;
- mov.u32 %r684, 4;
- shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
- mov.b32 %f658, %r685;
+ mov.b32 %r682, %f657;
+ mov.u32 %r683, 4;
+ shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
+ mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
- mov.b32 %r686, %f659;
- mov.u32 %r687, 2;
- shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
- mov.b32 %f660, %r688;
+ mov.b32 %r685, %f659;
+ mov.u32 %r686, 2;
+ shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
+ mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
- mov.b32 %r689, %f661;
- mov.u32 %r690, 1;
- shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
- mov.b32 %f662, %r691;
+ mov.b32 %r688, %f661;
+ mov.u32 %r689, 1;
+ shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
+ mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
- setp.eq.s32 %p224, %r132, 0;
+ setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
{ cvt.rn.f16.f32 %rs131, %f663;}
- mov.b32 %r692, %f769;
- mov.u32 %r693, 31;
- mov.u32 %r694, 16;
- mov.u32 %r695, -1;
- shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
- mov.b32 %f665, %r696;
+ mov.b32 %r691, %f769;
+ mov.u32 %r692, 31;
+ mov.u32 %r693, 16;
+ mov.u32 %r694, -1;
+ shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
+ mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
- mov.b32 %r697, %f666;
- mov.u32 %r698, 8;
- shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
- mov.b32 %f667, %r699;
+ mov.b32 %r696, %f666;
+ mov.u32 %r697, 8;
+ shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
+ mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
- mov.b32 %r700, %f668;
- mov.u32 %r701, 4;
- shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
- mov.b32 %f669, %r702;
+ mov.b32 %r699, %f668;
+ mov.u32 %r700, 4;
+ shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
+ mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
- mov.b32 %r703, %f670;
- mov.u32 %r704, 2;
- shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
- mov.b32 %f671, %r705;
+ mov.b32 %r702, %f670;
+ mov.u32 %r703, 2;
+ shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
+ mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
- mov.b32 %r706, %f672;
- mov.u32 %r707, 1;
- shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
- mov.b32 %f673, %r708;
+ mov.b32 %r705, %f672;
+ mov.u32 %r706, 1;
+ shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
+ mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
- setp.ge.u32 %p231, %r132, %r130;
+ setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
- mov.b32 %r709, %f773;
- mov.u32 %r710, 31;
- mov.u32 %r711, 16;
- mov.u32 %r712, -1;
- shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
- mov.b32 %f675, %r713;
+ mov.b32 %r708, %f773;
+ mov.u32 %r709, 31;
+ mov.u32 %r710, 16;
+ mov.u32 %r711, -1;
+ shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
+ mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
- mov.b32 %r714, %f676;
- mov.u32 %r715, 8;
- shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
- mov.b32 %f677, %r716;
+ mov.b32 %r713, %f676;
+ mov.u32 %r714, 8;
+ shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
+ mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
- mov.b32 %r717, %f678;
- mov.u32 %r718, 4;
- shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
- mov.b32 %f679, %r719;
+ mov.b32 %r716, %f678;
+ mov.u32 %r717, 4;
+ shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
+ mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
- mov.b32 %r720, %f680;
- mov.u32 %r721, 2;
- shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
- mov.b32 %f681, %r722;
+ mov.b32 %r719, %f680;
+ mov.u32 %r720, 2;
+ shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
+ mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
- mov.b32 %r723, %f682;
- mov.u32 %r724, 1;
- shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
- mov.b32 %f683, %r725;
+ mov.b32 %r722, %f682;
+ mov.u32 %r723, 1;
+ shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
+ mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
@@ -2202,228 +2196,228 @@
{ cvt.rn.f16.f32 %rs132, %f684;}
@%p6 bra $L__BB0_279;
- mul.lo.s32 %r164, %r128, %r769;
- add.s32 %r726, %r127, %r164;
- setp.ge.s32 %p239, %r726, %r203;
+ mul.lo.s32 %r163, %r127, %r763;
+ add.s32 %r725, %r126, %r163;
+ setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
- add.s32 %r727, %r129, %r164;
- mul.wide.s32 %rd212, %r727, 2;
+ add.s32 %r726, %r128, %r163;
+ mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
- setp.lt.s32 %p170, %r124, 1;
+ setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
- div.s32 %r134, %r125, %r3;
- mad.lo.s32 %r135, %r203, %r5, %r126;
- shl.b32 %r136, %r121, 1;
- shl.b32 %r137, %r11, 1;
- mul.lo.s32 %r138, %r203, %r3;
- mov.u32 %r770, 0;
+ div.s32 %r133, %r124, %r3;
+ mad.lo.s32 %r134, %r202, %r5, %r125;
+ shl.b32 %r135, %r120, 1;
+ shl.b32 %r136, %r11, 1;
+ mul.lo.s32 %r137, %r202, %r3;
+ mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
- setp.lt.s32 %p171, %r134, 1;
+ setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
- mad.lo.s32 %r140, %r128, %r770, %r127;
- mad.lo.s32 %r572, %r137, %r770, %r136;
- mad.lo.s32 %r772, %r4, %r572, %r135;
- mov.u32 %r571, 0;
+ mad.lo.s32 %r139, %r127, %r764, %r126;
+ mad.lo.s32 %r571, %r136, %r764, %r135;
+ mad.lo.s32 %r766, %r4, %r571, %r134;
+ mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
- mov.u32 %r771, %r5;
- mov.u32 %r773, %r571;
+ mov.u32 %r765, %r5;
+ mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
- setp.ge.s32 %p172, %r140, %r203;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p172, %r139, %r202;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
- setp.ge.s32 %p173, %r771, %r11;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p173, %r765, %r11;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
- mul.wide.s32 %rd207, %r772, 4;
+ mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
- ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
+ ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
$L__BB0_240:
- mov.b32 %f594, %r775;
+ mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
- mov.b32 %f595, %r774;
+ mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
- add.s32 %r772, %r772, %r138;
- add.s32 %r771, %r771, %r3;
- add.s32 %r773, %r773, 1;
- setp.lt.s32 %p174, %r773, %r134;
+ add.s32 %r766, %r766, %r137;
+ add.s32 %r765, %r765, %r3;
+ add.s32 %r767, %r767, 1;
+ setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
- mov.b32 %r579, %f762;
- mov.u32 %r580, 31;
- mov.u32 %r581, 16;
- mov.u32 %r582, -1;
- shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
- mov.b32 %f596, %r583;
+ mov.b32 %r578, %f762;
+ mov.u32 %r579, 31;
+ mov.u32 %r580, 16;
+ mov.u32 %r581, -1;
+ shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
+ mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
- mov.b32 %r584, %f597;
- mov.u32 %r585, 8;
- shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
- mov.b32 %f598, %r586;
+ mov.b32 %r583, %f597;
+ mov.u32 %r584, 8;
+ shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
+ mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
- mov.b32 %r587, %f599;
- mov.u32 %r588, 4;
- shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
- mov.b32 %f600, %r589;
+ mov.b32 %r586, %f599;
+ mov.u32 %r587, 4;
+ shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
+ mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
- mov.b32 %r590, %f601;
- mov.u32 %r591, 2;
- shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
- mov.b32 %f602, %r592;
+ mov.b32 %r589, %f601;
+ mov.u32 %r590, 2;
+ shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
+ mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
- mov.b32 %r593, %f603;
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
- mov.b32 %f604, %r595;
+ mov.b32 %r592, %f603;
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
+ mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
- setp.ne.s32 %p180, %r132, 0;
+ setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
- setp.ne.s32 %p181, %r131, 0;
+ setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
- setp.ge.u32 %p182, %r132, %r130;
+ setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
- mov.b32 %r596, %f763;
- mov.u32 %r597, 31;
- mov.u32 %r598, 16;
- mov.u32 %r599, -1;
- shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
- mov.b32 %f606, %r600;
+ mov.b32 %r595, %f763;
+ mov.u32 %r596, 31;
+ mov.u32 %r597, 16;
+ mov.u32 %r598, -1;
+ shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
+ mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
- mov.b32 %r601, %f607;
- mov.u32 %r602, 8;
- shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
- mov.b32 %f608, %r603;
+ mov.b32 %r600, %f607;
+ mov.u32 %r601, 8;
+ shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
+ mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
- mov.b32 %r604, %f609;
- mov.u32 %r605, 4;
- shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
- mov.b32 %f610, %r606;
+ mov.b32 %r603, %f609;
+ mov.u32 %r604, 4;
+ shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
+ mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
- mov.b32 %r607, %f611;
- mov.u32 %r608, 2;
- shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
- mov.b32 %f612, %r609;
+ mov.b32 %r606, %f611;
+ mov.u32 %r607, 2;
+ shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
+ mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
- mov.b32 %r610, %f613;
- mov.u32 %r611, 1;
- shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
- mov.b32 %f614, %r612;
+ mov.b32 %r609, %f613;
+ mov.u32 %r610, 1;
+ shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
+ mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
- setp.eq.s32 %p189, %r132, 0;
+ setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
{ cvt.rn.f16.f32 %rs129, %f615;}
- mov.b32 %r613, %f761;
- mov.u32 %r614, 31;
- mov.u32 %r615, 16;
- mov.u32 %r616, -1;
- shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
- mov.b32 %f617, %r617;
+ mov.b32 %r612, %f761;
+ mov.u32 %r613, 31;
+ mov.u32 %r614, 16;
+ mov.u32 %r615, -1;
+ shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
+ mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
- mov.b32 %r618, %f618;
- mov.u32 %r619, 8;
- shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
- mov.b32 %f619, %r620;
+ mov.b32 %r617, %f618;
+ mov.u32 %r618, 8;
+ shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
+ mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
- mov.b32 %r621, %f620;
- mov.u32 %r622, 4;
- shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
- mov.b32 %f621, %r623;
+ mov.b32 %r620, %f620;
+ mov.u32 %r621, 4;
+ shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
+ mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
- mov.b32 %r624, %f622;
- mov.u32 %r625, 2;
- shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
- mov.b32 %f623, %r626;
+ mov.b32 %r623, %f622;
+ mov.u32 %r624, 2;
+ shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
+ mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
- mov.b32 %r627, %f624;
- mov.u32 %r628, 1;
- shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
- mov.b32 %f625, %r629;
+ mov.b32 %r626, %f624;
+ mov.u32 %r627, 1;
+ shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
+ mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
- setp.ge.u32 %p196, %r132, %r130;
+ setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
- mov.b32 %r630, %f765;
- mov.u32 %r631, 31;
- mov.u32 %r632, 16;
- mov.u32 %r633, -1;
- shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
- mov.b32 %f627, %r634;
+ mov.b32 %r629, %f765;
+ mov.u32 %r630, 31;
+ mov.u32 %r631, 16;
+ mov.u32 %r632, -1;
+ shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
+ mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
- mov.b32 %r635, %f628;
- mov.u32 %r636, 8;
- shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
- mov.b32 %f629, %r637;
+ mov.b32 %r634, %f628;
+ mov.u32 %r635, 8;
+ shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
+ mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
- mov.b32 %r638, %f630;
- mov.u32 %r639, 4;
- shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
- mov.b32 %f631, %r640;
+ mov.b32 %r637, %f630;
+ mov.u32 %r638, 4;
+ shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
+ mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
- mov.b32 %r641, %f632;
- mov.u32 %r642, 2;
- shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
- mov.b32 %f633, %r643;
+ mov.b32 %r640, %f632;
+ mov.u32 %r641, 2;
+ shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
+ mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
- mov.b32 %r644, %f634;
- mov.u32 %r645, 1;
- shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
- mov.b32 %f635, %r646;
+ mov.b32 %r643, %f634;
+ mov.u32 %r644, 1;
+ shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
+ mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
@@ -2432,23 +2426,23 @@
{ cvt.rn.f16.f32 %rs130, %f636;}
@%p6 bra $L__BB0_256;
- mul.lo.s32 %r152, %r128, %r770;
- add.s32 %r647, %r127, %r152;
- setp.ge.s32 %p204, %r647, %r203;
+ mul.lo.s32 %r151, %r127, %r764;
+ add.s32 %r646, %r126, %r151;
+ setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
- add.s32 %r648, %r129, %r152;
- mul.wide.s32 %rd208, %r648, 2;
+ add.s32 %r647, %r128, %r151;
+ mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
- add.s32 %r770, %r770, 1;
- setp.lt.s32 %p205, %r770, %r124;
+ add.s32 %r764, %r764, 1;
+ setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
18: CombinedSchedulerTest.LayerNormBackward/dtype___half_batch_216_hidden_65536
Kernel 3
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-9
+9 index type: int
registers: 40
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T18, Tensor<float, 1, 1> T15, Tensor<float, 2, 2> T14, Tensor<float, 2, 2> T11, Tensor<__half, 2, 2> T28) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T34 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T35 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T11.data;
s0.logical_size = T11.logical_size;
s0.alloc_stride = T11.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T33[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
* T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2half(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T33[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
= T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
* T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2half(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T18, Tensor<float, 1, 1> T15, Tensor<float, 2, 2> T14, Tensor<float, 2, 2> T11, Tensor<__half, 2, 2> T28) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T34 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T35 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T11.data;
s0.logical_size = T11.logical_size;
s0.alloc_stride = T11.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
* T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2half(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
= T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
* T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2half(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<__half, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -17,11 +17,11 @@
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
- if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
+ if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
@@ -46,11 +46,11 @@
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
- T33[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
__barrier_sync(0);
@@ -59,19 +59,19 @@
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
- = T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
- = T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
@@ -80,11 +80,11 @@
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
- * T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
@@ -133,11 +133,11 @@
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
- T33[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__half, 8, 4> T38;
@@ -148,21 +148,21 @@
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
- = T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
- = T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
@@ -173,11 +173,11 @@
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
- * T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
.reg .f32 %f<257>;
.reg .b32 %r<243>;
.reg .f64 %fd<3>;
.reg .b64 %rd<61>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4+8];
ld.param.u64 %rd13, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5];
ld.param.u64 %rd12, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1];
ld.param.u64 %rd11, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_3];
ld.param.u64 %rd14, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2];
cvta.to.global.u64 %rd1, %rd11;
cvta.to.global.u64 %rd2, %rd14;
ld.param.u64 %rd4, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd15, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd15], %r2;
ld.shared.u32 %r4, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_1033910nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd16, %r2, 4;
mov.u64 %rd17, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_3aaa54bc_103395arrayE;
add.s64 %rd6, %rd17, %rd16;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r67, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd18, %rd12;
mul.wide.s32 %rd19, %r8, 4;
add.s64 %rd8, %rd18, %rd19;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r82, %r9, %r7;
add.s32 %r83, %r82, 16;
setp.gt.s32 %p4, %r83, 215;
@%p4 bra $L__BB0_7;
and.b32 %r87, %r81, 1073741816;
sub.s32 %r88, %r2, %r87;
shl.b32 %r10, %r88, 2;
rem.s32 %r89, %r5, %r6;
shl.b32 %r11, %r89, 5;
add.s32 %r90, %r10, %r11;
or.b32 %r91, %r90, 3;
setp.ge.s32 %p5, %r91, %r67;
@%p5 bra $L__BB0_7;
shr.u32 %r93, %r79, 27;
add.s32 %r94, %r2, %r93;
and.b32 %r95, %r94, -32;
sub.s32 %r12, %r2, %r95;
add.s32 %r13, %r7, %r12;
setp.lt.s32 %p6, %r13, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
shl.b32 %r212, %r4, 5;
add.s32 %r213, %r7, %r9;
add.s32 %r214, %r213, %r212;
mad.lo.s32 %r215, %r214, %r67, %r10;
add.s32 %r216, %r215, %r11;
mul.wide.s32 %rd46, %r216, 4;
add.s64 %rd40, %rd4, %rd46;
// begin inline asm
ld.global.cs.v4.u32 {%r192,%r193,%r194,%r195}, [%rd40];
// end inline asm
add.s32 %r217, %r214, 16;
mad.lo.s32 %r218, %r217, %r67, %r10;
add.s32 %r219, %r218, %r11;
mul.wide.s32 %rd47, %r219, 4;
add.s64 %rd41, %rd4, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd41];
// end inline asm
shl.b64 %rd48, %rd7, 2;
add.s64 %rd49, %rd2, %rd48;
ld.global.f32 %f166, [%rd49];
st.shared.f32 [%rd6+4096], %f166;
mul.lo.s32 %r220, %r4, 96;
add.s32 %r221, %r214, %r220;
mad.lo.s32 %r222, %r221, %r67, %r10;
add.s32 %r223, %r222, %r11;
mul.wide.s32 %rd50, %r223, 4;
add.s64 %rd42, %rd3, %rd50;
// begin inline asm
ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd42];
// end inline asm
add.s32 %r224, %r217, %r220;
mad.lo.s32 %r225, %r224, %r67, %r10;
add.s32 %r226, %r225, %r11;
mul.wide.s32 %rd51, %r226, 4;
add.s64 %rd43, %rd3, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd43];
// end inline asm
mul.lo.s32 %r227, %r13, %r62;
mul.wide.s32 %rd52, %r227, 4;
add.s64 %rd53, %rd1, %rd52;
mul.wide.s32 %rd54, %r12, 4;
add.s64 %rd56, %rd17, %rd54;
ld.global.f32 %f167, [%rd53];
st.shared.f32 [%rd56], %f167;
barrier.sync 0;
mul.wide.s32 %rd57, %r9, 4;
add.s64 %rd58, %rd17, %rd57;
ld.shared.f32 %f168, [%rd58];
cvt.rn.f32.f64 %f169, %fd1;
mul.f32 %f170, %f168, %f169;
mov.b32 %f171, %r200;
ld.shared.f32 %f172, [%rd58+4096];
sub.f32 %f173, %f171, %f172;
mov.b32 %f174, %r192;
ld.shared.f32 %f175, [%rd58+4608];
mul.f32 %f176, %f175, %f174;
sub.f32 %f177, %f173, %f176;
mul.f32 %f157, %f170, %f177;
mov.b32 %f178, %r201;
sub.f32 %f179, %f178, %f172;
mov.b32 %f180, %r193;
mul.f32 %f181, %f175, %f180;
sub.f32 %f182, %f179, %f181;
mul.f32 %f158, %f170, %f182;
// begin inline asm
{ cvt.rn.f16.f32 %rs22, %f158;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs21, %f157;}
// end inline asm
mov.b32 %r208, {%rs21, %rs22};
mov.b32 %f183, %r202;
sub.f32 %f184, %f183, %f172;
mov.b32 %f185, %r194;
mul.f32 %f186, %f175, %f185;
sub.f32 %f187, %f184, %f186;
mul.f32 %f159, %f170, %f187;
mov.b32 %f188, %r203;
sub.f32 %f189, %f188, %f172;
mov.b32 %f190, %r195;
mul.f32 %f191, %f175, %f190;
sub.f32 %f192, %f189, %f191;
mul.f32 %f160, %f170, %f192;
// begin inline asm
{ cvt.rn.f16.f32 %rs24, %f160;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs23, %f159;}
// end inline asm
mov.b32 %r209, {%rs23, %rs24};
ld.shared.f32 %f193, [%rd58+64];
mul.f32 %f194, %f193, %f169;
mov.b32 %f195, %r204;
ld.shared.f32 %f196, [%rd58+4160];
sub.f32 %f197, %f195, %f196;
mov.b32 %f198, %r196;
ld.shared.f32 %f199, [%rd58+4672];
mul.f32 %f200, %f199, %f198;
sub.f32 %f201, %f197, %f200;
mul.f32 %f161, %f194, %f201;
mov.b32 %f202, %r205;
sub.f32 %f203, %f202, %f196;
mov.b32 %f204, %r197;
mul.f32 %f205, %f199, %f204;
sub.f32 %f206, %f203, %f205;
mul.f32 %f162, %f194, %f206;
// begin inline asm
{ cvt.rn.f16.f32 %rs26, %f162;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs25, %f161;}
// end inline asm
mov.b32 %r210, {%rs25, %rs26};
mov.b32 %f207, %r206;
sub.f32 %f208, %f207, %f196;
mov.b32 %f209, %r198;
mul.f32 %f210, %f199, %f209;
sub.f32 %f211, %f208, %f210;
mul.f32 %f163, %f194, %f211;
mov.b32 %f212, %r207;
sub.f32 %f213, %f212, %f196;
mov.b32 %f214, %r199;
mul.f32 %f215, %f199, %f214;
sub.f32 %f216, %f213, %f215;
mul.f32 %f164, %f194, %f216;
// begin inline asm
{ cvt.rn.f16.f32 %rs28, %f164;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs27, %f163;}
// end inline asm
mov.b32 %r211, {%rs27, %rs28};
mul.lo.s32 %r228, %r4, 896;
add.s32 %r229, %r221, %r228;
mad.lo.s32 %r230, %r229, %r67, %r10;
add.s32 %r231, %r230, %r11;
mul.wide.s32 %rd59, %r231, 2;
add.s64 %rd44, %rd13, %rd59;
// begin inline asm
st.global.cs.v2.s32 [%rd44], {%r208,%r209};
// end inline asm
add.s32 %r232, %r224, %r228;
mad.lo.s32 %r233, %r232, %r67, %r10;
add.s32 %r234, %r233, %r11;
mul.wide.s32 %rd60, %r234, 2;
add.s64 %rd45, %rd13, %rd60;
// begin inline asm
st.global.cs.v2.s32 [%rd45], {%r210,%r211};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
mov.u32 %r14, %ctaid.x;
add.s32 %r100, %r67, 31;
shr.s32 %r101, %r100, 31;
shr.u32 %r102, %r101, 27;
add.s32 %r103, %r100, %r102;
shr.s32 %r15, %r103, 5;
shl.b32 %r16, %r4, 5;
shr.s32 %r104, %r2, 31;
shr.u32 %r105, %r104, 29;
add.s32 %r106, %r2, %r105;
and.b32 %r107, %r106, 1073741816;
sub.s32 %r108, %r2, %r107;
shl.b32 %r109, %r108, 2;
rem.s32 %r110, %r14, %r15;
shl.b32 %r111, %r110, 5;
add.s32 %r20, %r111, %r109;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
shr.s32 %r18, %r106, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r235, 0;
mov.u32 %r236, %r235;
mov.u32 %r237, %r235;
mov.u32 %r238, %r235;
@%p8 bra $L__BB0_12;
div.s32 %r116, %r14, %r15;
shl.b32 %r21, %r116, 5;
add.s32 %r117, %r19, %r21;
neg.s32 %r118, %r16;
setp.ge.s32 %p9, %r117, %r118;
@%p9 bra $L__BB0_12;
add.s32 %r123, %r16, %r18;
add.s32 %r124, %r123, %r21;
mad.lo.s32 %r125, %r124, %r67, %r20;
mul.wide.s32 %rd21, %r125, 4;
add.s64 %rd20, %rd4, %rd21;
// begin inline asm
ld.global.cs.v4.u32 {%r238,%r237,%r236,%r235}, [%rd20];
// end inline asm
$L__BB0_12:
mov.f32 %f225, 0f00000000;
mov.f32 %f226, 0f00000000;
mov.f32 %f227, 0f00000000;
mov.f32 %f228, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r126, %r14, %r15;
shl.b32 %r30, %r126, 5;
add.s32 %r127, %r19, %r30;
mov.u32 %r128, -16;
sub.s32 %r129, %r128, %r16;
setp.ge.s32 %p11, %r127, %r129;
@%p11 bra $L__BB0_15;
add.s32 %r134, %r16, %r18;
add.s32 %r135, %r134, %r30;
add.s32 %r136, %r135, 16;
mad.lo.s32 %r137, %r136, %r67, %r20;
mul.wide.s32 %rd23, %r137, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
// end inline asm
mov.b32 %f228, %r130;
mov.b32 %f227, %r131;
mov.b32 %f226, %r132;
mov.b32 %f225, %r133;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r138, %r14, %r15;
shl.b32 %r139, %r138, 5;
add.s32 %r32, %r139, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd24, %r32, 4;
add.s64 %rd25, %rd2, %rd24;
ld.global.f32 %f91, [%rd25];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
mov.u32 %r239, 0;
mov.u32 %r240, %r239;
mov.u32 %r241, %r239;
mov.u32 %r242, %r239;
@%p8 bra $L__BB0_21;
div.s32 %r148, %r14, %r15;
shl.b32 %r33, %r148, 5;
add.s32 %r149, %r19, %r33;
neg.s32 %r150, %r31;
setp.ge.s32 %p15, %r149, %r150;
@%p15 bra $L__BB0_21;
add.s32 %r155, %r31, %r18;
add.s32 %r156, %r155, %r33;
mad.lo.s32 %r157, %r156, %r67, %r20;
mul.wide.s32 %rd27, %r157, 4;
add.s64 %rd26, %rd3, %rd27;
// begin inline asm
ld.global.cs.v4.u32 {%r242,%r241,%r240,%r239}, [%rd26];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
mov.f32 %f229, 0f00000000;
mov.f32 %f230, 0f00000000;
mov.f32 %f231, 0f00000000;
mov.f32 %f232, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r158, %r14, %r15;
shl.b32 %r42, %r158, 5;
add.s32 %r159, %r19, %r42;
mov.u32 %r160, -16;
sub.s32 %r161, %r160, %r31;
setp.ge.s32 %p17, %r159, %r161;
@%p17 bra $L__BB0_24;
add.s32 %r166, %r31, %r18;
add.s32 %r167, %r166, %r42;
add.s32 %r168, %r167, 16;
mad.lo.s32 %r169, %r168, %r67, %r20;
mul.wide.s32 %rd29, %r169, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
// end inline asm
mov.b32 %f232, %r162;
mov.b32 %f231, %r163;
mov.b32 %f230, %r164;
mov.b32 %f229, %r165;
$L__BB0_24:
div.s32 %r170, %r14, %r15;
shl.b32 %r43, %r170, 5;
shr.u32 %r172, %r104, 27;
add.s32 %r173, %r2, %r172;
and.b32 %r174, %r173, -32;
sub.s32 %r44, %r2, %r174;
add.s32 %r175, %r43, %r44;
setp.gt.s32 %p18, %r175, 215;
mul.lo.s32 %r176, %r175, %r62;
mul.wide.s32 %rd30, %r176, 4;
add.s64 %rd9, %rd1, %rd30;
@%p18 bra $L__BB0_26;
mul.wide.s32 %rd31, %r44, 4;
add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f100, [%rd9];
st.shared.f32 [%rd33], %f100;
$L__BB0_26:
shl.b32 %r45, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r46, %r45;
add.s32 %r47, %r19, %r43;
setp.ge.s32 %p19, %r47, %r46;
mul.wide.s32 %rd34, %r18, 4;
add.s64 %rd10, %rd17, %rd34;
mov.f32 %f234, 0f00000000;
mov.f32 %f233, %f234;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f233, [%rd10+4096];
$L__BB0_28:
mov.b32 %f103, %r242;
sub.f32 %f20, %f103, %f233;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f234, [%rd10+4608];
$L__BB0_30:
mov.b32 %f105, %r238;
mul.f32 %f106, %f234, %f105;
sub.f32 %f23, %f20, %f106;
mov.f32 %f236, 0f00000000;
mov.f32 %f235, %f236;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f107, [%rd10];
mul.f32 %f235, %f107, %f17;
$L__BB0_32:
mul.f32 %f108, %f23, %f235;
// begin inline asm
{ cvt.rn.f16.f32 %rs13, %f108;}
// end inline asm
@%p19 bra $L__BB0_34;
ld.shared.f32 %f236, [%rd10+4096];
$L__BB0_34:
mov.b32 %f111, %r241;
sub.f32 %f28, %f111, %f236;
mov.f32 %f238, 0f00000000;
mov.f32 %f237, %f238;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f237, [%rd10+4608];
$L__BB0_36:
mov.b32 %f113, %r237;
mul.f32 %f114, %f237, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f115, [%rd10];
mul.f32 %f238, %f115, %f17;
$L__BB0_38:
mul.f32 %f116, %f31, %f238;
// begin inline asm
{ cvt.rn.f16.f32 %rs14, %f116;}
// end inline asm
mov.f32 %f240, 0f00000000;
mov.f32 %f239, %f240;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f239, [%rd10+4096];
$L__BB0_40:
mov.b32 %f119, %r240;
sub.f32 %f36, %f119, %f239;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f240, [%rd10+4608];
$L__BB0_42:
mov.b32 %f121, %r236;
mul.f32 %f122, %f240, %f121;
sub.f32 %f39, %f36, %f122;
mov.f32 %f242, 0f00000000;
mov.f32 %f241, %f242;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f123, [%rd10];
mul.f32 %f241, %f123, %f17;
$L__BB0_44:
mul.f32 %f124, %f39, %f241;
// begin inline asm
{ cvt.rn.f16.f32 %rs15, %f124;}
// end inline asm
@%p19 bra $L__BB0_46;
ld.shared.f32 %f242, [%rd10+4096];
$L__BB0_46:
mov.b32 %f127, %r239;
sub.f32 %f44, %f127, %f242;
mov.f32 %f244, 0f00000000;
mov.f32 %f243, %f244;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f243, [%rd10+4608];
$L__BB0_48:
mov.b32 %f129, %r235;
mul.f32 %f130, %f243, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f131, [%rd10];
mul.f32 %f244, %f131, %f17;
$L__BB0_50:
mul.f32 %f132, %f47, %f244;
// begin inline asm
{ cvt.rn.f16.f32 %rs16, %f132;}
// end inline asm
mov.u32 %r177, -16;
sub.s32 %r48, %r177, %r45;
setp.ge.s32 %p31, %r47, %r48;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f245, [%rd10+4160];
$L__BB0_52:
sub.f32 %f52, %f232, %f245;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f246, [%rd10+4672];
$L__BB0_54:
mul.f32 %f136, %f246, %f228;
sub.f32 %f55, %f52, %f136;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f137, [%rd10+64];
mul.f32 %f247, %f137, %f17;
$L__BB0_56:
mul.f32 %f138, %f55, %f247;
// begin inline asm
{ cvt.rn.f16.f32 %rs17, %f138;}
// end inline asm
@%p31 bra $L__BB0_58;
ld.shared.f32 %f248, [%rd10+4160];
$L__BB0_58:
sub.f32 %f60, %f231, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f249, [%rd10+4672];
$L__BB0_60:
mul.f32 %f142, %f249, %f227;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f143, [%rd10+64];
mul.f32 %f250, %f143, %f17;
$L__BB0_62:
mul.f32 %f144, %f63, %f250;
// begin inline asm
{ cvt.rn.f16.f32 %rs18, %f144;}
// end inline asm
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f251, [%rd10+4160];
$L__BB0_64:
sub.f32 %f68, %f230, %f251;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f252, [%rd10+4672];
$L__BB0_66:
mul.f32 %f148, %f252, %f226;
sub.f32 %f71, %f68, %f148;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f149, [%rd10+64];
mul.f32 %f253, %f149, %f17;
$L__BB0_68:
mul.f32 %f150, %f71, %f253;
// begin inline asm
{ cvt.rn.f16.f32 %rs19, %f150;}
// end inline asm
@%p31 bra $L__BB0_70;
ld.shared.f32 %f254, [%rd10+4160];
$L__BB0_70:
sub.f32 %f76, %f229, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f255, [%rd10+4672];
$L__BB0_72:
mul.f32 %f154, %f255, %f225;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f155, [%rd10+64];
mul.f32 %f256, %f155, %f17;
$L__BB0_74:
mul.f32 %f156, %f79, %f256;
// begin inline asm
{ cvt.rn.f16.f32 %rs20, %f156;}
// end inline asm
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r178, %r49;
setp.ge.s32 %p44, %r47, %r178;
@%p44 bra $L__BB0_77;
add.s32 %r181, %r49, %r18;
add.s32 %r182, %r181, %r43;
mad.lo.s32 %r183, %r182, %r67, %r20;
mul.wide.s32 %rd37, %r183, 2;
add.s64 %rd36, %rd13, %rd37;
mov.b32 %r180, {%rs15, %rs16};
mov.b32 %r179, {%rs13, %rs14};
// begin inline asm
st.global.cs.v2.s32 [%rd36], {%r179,%r180};
// end inline asm
$L__BB0_77:
mov.u32 %r184, -16;
sub.s32 %r185, %r184, %r49;
setp.ge.s32 %p46, %r47, %r185;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r188, %r49, %r18;
add.s32 %r189, %r188, %r43;
add.s32 %r190, %r189, 16;
mad.lo.s32 %r191, %r190, %r67, %r20;
mul.wide.s32 %rd39, %r191, 2;
add.s64 %rd38, %rd13, %rd39;
mov.b32 %r187, {%rs19, %rs20};
mov.b32 %r186, {%rs17, %rs18};
// begin inline asm
st.global.cs.v2.s32 [%rd38], {%r186,%r187};
// end inline asm
$L__BB0_80:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
.reg .f32 %f<269>;
.reg .b32 %r<266>;
.reg .f64 %fd<3>;
.reg .b64 %rd<69>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4+8];
ld.param.u64 %rd15, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5];
ld.param.u64 %rd14, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1];
ld.param.u64 %rd13, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_3];
ld.param.u64 %rd16, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2];
cvta.to.global.u64 %rd1, %rd13;
cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd17, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_723310nvfuser_39ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd18, %r2, 4;
mov.u64 %rd19, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_39_cu_72c8bc07_72335arrayE;
add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r67, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd20, %rd14;
mul.wide.s32 %rd21, %r8, 4;
add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r10, %r9, 16;
add.s32 %r82, %r10, %r7;
setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
rem.s32 %r83, %r5, %r6;
shl.b32 %r11, %r83, 5;
or.b32 %r84, %r11, 31;
setp.ge.s32 %p5, %r84, %r67;
@%p5 bra $L__BB0_7;
shr.u32 %r86, %r79, 27;
add.s32 %r87, %r2, %r86;
and.b32 %r88, %r87, -32;
sub.s32 %r89, %r2, %r88;
add.s32 %r12, %r7, %r89;
setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
and.b32 %r220, %r81, -8;
sub.s32 %r221, %r2, %r220;
shl.b32 %r222, %r221, 2;
shl.b32 %r223, %r4, 5;
add.s32 %r224, %r7, %r9;
add.s32 %r225, %r224, %r223;
mad.lo.s32 %r226, %r225, %r67, %r222;
add.s32 %r227, %r226, %r11;
mul.wide.s32 %rd51, %r227, 4;
add.s64 %rd45, %rd4, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r197,%r198,%r199,%r200}, [%rd45];
// end inline asm
add.s32 %r228, %r225, 16;
mad.lo.s32 %r229, %r228, %r67, %r222;
add.s32 %r230, %r229, %r11;
mul.wide.s32 %rd52, %r230, 4;
add.s64 %rd46, %rd4, %rd52;
// begin inline asm
ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd46];
// end inline asm
shl.b64 %rd53, %rd7, 2;
add.s64 %rd54, %rd2, %rd53;
ld.global.f32 %f166, [%rd54];
st.shared.f32 [%rd6+4096], %f166;
mul.lo.s32 %r231, %r4, 96;
add.s32 %r232, %r225, %r231;
mad.lo.s32 %r233, %r232, %r67, %r222;
add.s32 %r234, %r233, %r11;
mul.wide.s32 %rd55, %r234, 4;
add.s64 %rd47, %rd3, %rd55;
// begin inline asm
ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd47];
// end inline asm
add.s32 %r235, %r228, %r231;
mad.lo.s32 %r236, %r235, %r67, %r222;
add.s32 %r237, %r236, %r11;
mul.wide.s32 %rd56, %r237, 4;
add.s64 %rd48, %rd3, %rd56;
// begin inline asm
ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd48];
// end inline asm
mul.lo.s32 %r238, %r12, %r62;
mul.wide.s32 %rd57, %r238, 4;
add.s64 %rd58, %rd1, %rd57;
ld.global.f32 %f167, [%rd58];
st.shared.f32 [%rd6], %f167;
st.shared.f32 [%rd6+512], %f167;
st.shared.f32 [%rd6+1024], %f167;
st.shared.f32 [%rd6+1536], %f167;
st.shared.f32 [%rd6+2048], %f167;
st.shared.f32 [%rd6+2560], %f167;
st.shared.f32 [%rd6+3072], %f167;
st.shared.f32 [%rd6+3584], %f167;
barrier.sync 0;
shl.b32 %r239, %r221, 7;
add.s32 %r240, %r239, %r9;
shr.s32 %r241, %r9, 31;
shr.u32 %r242, %r241, 25;
add.s32 %r243, %r9, %r242;
and.b32 %r244, %r243, -128;
sub.s32 %r245, %r9, %r244;
mul.wide.s32 %rd59, %r245, 4;
add.s64 %rd61, %rd19, 4096;
add.s64 %rd62, %rd61, %rd59;
mov.b32 %f168, %r205;
ld.shared.f32 %f169, [%rd62];
sub.f32 %f170, %f168, %f169;
mov.b32 %f171, %r197;
ld.shared.f32 %f172, [%rd62+512];
mul.f32 %f173, %f172, %f171;
sub.f32 %f174, %f170, %f173;
mul.wide.s32 %rd63, %r240, 4;
add.s64 %rd64, %rd19, %rd63;
ld.shared.f32 %f175, [%rd64];
cvt.rn.f32.f64 %f176, %fd1;
mul.f32 %f177, %f175, %f176;
mul.f32 %f157, %f177, %f174;
mov.b32 %f178, %r206;
sub.f32 %f179, %f178, %f169;
mov.b32 %f180, %r198;
mul.f32 %f181, %f172, %f180;
sub.f32 %f182, %f179, %f181;
ld.shared.f32 %f183, [%rd64+128];
mul.f32 %f184, %f183, %f176;
mul.f32 %f158, %f184, %f182;
// begin inline asm
{ cvt.rn.f16.f32 %rs22, %f158;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs21, %f157;}
// end inline asm
mov.b32 %r213, {%rs21, %rs22};
mov.b32 %f185, %r207;
sub.f32 %f186, %f185, %f169;
mov.b32 %f187, %r199;
mul.f32 %f188, %f172, %f187;
sub.f32 %f189, %f186, %f188;
ld.shared.f32 %f190, [%rd64+256];
mul.f32 %f191, %f190, %f176;
mul.f32 %f159, %f191, %f189;
mov.b32 %f192, %r208;
sub.f32 %f193, %f192, %f169;
mov.b32 %f194, %r200;
mul.f32 %f195, %f172, %f194;
sub.f32 %f196, %f193, %f195;
ld.shared.f32 %f197, [%rd64+384];
mul.f32 %f198, %f197, %f176;
mul.f32 %f160, %f198, %f196;
// begin inline asm
{ cvt.rn.f16.f32 %rs24, %f160;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs23, %f159;}
// end inline asm
mov.b32 %r214, {%rs23, %rs24};
shr.s32 %r246, %r10, 31;
shr.u32 %r247, %r246, 25;
add.s32 %r248, %r10, %r247;
and.b32 %r249, %r248, -128;
sub.s32 %r250, %r10, %r249;
mul.wide.s32 %rd65, %r250, 4;
add.s64 %rd66, %rd61, %rd65;
mov.b32 %f199, %r209;
ld.shared.f32 %f200, [%rd66];
sub.f32 %f201, %f199, %f200;
mov.b32 %f202, %r201;
ld.shared.f32 %f203, [%rd66+512];
mul.f32 %f204, %f203, %f202;
sub.f32 %f205, %f201, %f204;
ld.shared.f32 %f206, [%rd64+64];
mul.f32 %f207, %f206, %f176;
mul.f32 %f161, %f207, %f205;
mov.b32 %f208, %r210;
sub.f32 %f209, %f208, %f200;
mov.b32 %f210, %r202;
mul.f32 %f211, %f203, %f210;
sub.f32 %f212, %f209, %f211;
ld.shared.f32 %f213, [%rd64+192];
mul.f32 %f214, %f213, %f176;
mul.f32 %f162, %f214, %f212;
// begin inline asm
{ cvt.rn.f16.f32 %rs26, %f162;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs25, %f161;}
// end inline asm
mov.b32 %r215, {%rs25, %rs26};
mov.b32 %f215, %r211;
sub.f32 %f216, %f215, %f200;
mov.b32 %f217, %r203;
mul.f32 %f218, %f203, %f217;
sub.f32 %f219, %f216, %f218;
ld.shared.f32 %f220, [%rd64+320];
mul.f32 %f221, %f220, %f176;
mul.f32 %f163, %f221, %f219;
mov.b32 %f222, %r212;
sub.f32 %f223, %f222, %f200;
mov.b32 %f224, %r204;
mul.f32 %f225, %f203, %f224;
sub.f32 %f226, %f223, %f225;
ld.shared.f32 %f227, [%rd64+448];
mul.f32 %f228, %f227, %f176;
mul.f32 %f164, %f228, %f226;
// begin inline asm
{ cvt.rn.f16.f32 %rs28, %f164;}
// end inline asm
// begin inline asm
{ cvt.rn.f16.f32 %rs27, %f163;}
// end inline asm
mov.b32 %r216, {%rs27, %rs28};
shl.b32 %r251, %r4, 10;
add.s32 %r252, %r224, %r251;
mad.lo.s32 %r253, %r252, %r67, %r222;
add.s32 %r254, %r253, %r11;
mul.wide.s32 %rd67, %r254, 2;
add.s64 %rd49, %rd15, %rd67;
// begin inline asm
st.global.cs.v2.s32 [%rd49], {%r213,%r214};
// end inline asm
add.s32 %r255, %r252, 16;
mad.lo.s32 %r256, %r255, %r67, %r222;
add.s32 %r257, %r256, %r11;
mul.wide.s32 %rd68, %r257, 2;
add.s64 %rd50, %rd15, %rd68;
// begin inline asm
st.global.cs.v2.s32 [%rd50], {%r215,%r216};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
mov.u32 %r13, %ctaid.x;
add.s32 %r94, %r67, 31;
shr.s32 %r95, %r94, 31;
shr.u32 %r96, %r95, 27;
add.s32 %r97, %r94, %r96;
shr.s32 %r14, %r97, 5;
shl.b32 %r15, %r4, 5;
shr.s32 %r98, %r2, 31;
shr.u32 %r99, %r98, 29;
add.s32 %r100, %r2, %r99;
and.b32 %r101, %r100, -8;
sub.s32 %r16, %r2, %r101;
shl.b32 %r102, %r16, 2;
rem.s32 %r103, %r13, %r14;
shl.b32 %r104, %r103, 5;
add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r258, 0;
mov.u32 %r259, %r258;
mov.u32 %r260, %r258;
mov.u32 %r261, %r258;
@%p8 bra $L__BB0_12;
div.s32 %r109, %r13, %r14;
shl.b32 %r21, %r109, 5;
add.s32 %r110, %r19, %r21;
neg.s32 %r111, %r15;
setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
add.s32 %r116, %r15, %r18;
add.s32 %r117, %r116, %r21;
mad.lo.s32 %r118, %r117, %r67, %r20;
mul.wide.s32 %rd23, %r118, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r261,%r260,%r259,%r258}, [%rd22];
// end inline asm
$L__BB0_12:
mov.f32 %f237, 0f00000000;
mov.f32 %f238, 0f00000000;
mov.f32 %f239, 0f00000000;
mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r119, %r13, %r14;
shl.b32 %r30, %r119, 5;
add.s32 %r120, %r19, %r30;
mov.u32 %r121, -16;
sub.s32 %r122, %r121, %r15;
setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
add.s32 %r127, %r15, %r18;
add.s32 %r128, %r127, %r30;
add.s32 %r129, %r128, 16;
mad.lo.s32 %r130, %r129, %r67, %r20;
mul.wide.s32 %rd25, %r130, 4;
add.s64 %rd24, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
// end inline asm
mov.b32 %f240, %r123;
mov.b32 %f239, %r124;
mov.b32 %f238, %r125;
mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r131, %r13, %r14;
shl.b32 %r132, %r131, 5;
add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd26, %r32, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.f32 %f91, [%rd27];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
mov.u32 %r262, 0;
mov.u32 %r263, %r262;
mov.u32 %r264, %r262;
mov.u32 %r265, %r262;
@%p8 bra $L__BB0_21;
div.s32 %r141, %r13, %r14;
shl.b32 %r33, %r141, 5;
add.s32 %r142, %r19, %r33;
neg.s32 %r143, %r31;
setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
add.s32 %r148, %r31, %r18;
add.s32 %r149, %r148, %r33;
mad.lo.s32 %r150, %r149, %r67, %r20;
mul.wide.s32 %rd29, %r150, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r265,%r264,%r263,%r262}, [%rd28];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
mov.f32 %f241, 0f00000000;
mov.f32 %f242, 0f00000000;
mov.f32 %f243, 0f00000000;
mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r151, %r13, %r14;
shl.b32 %r42, %r151, 5;
add.s32 %r152, %r19, %r42;
mov.u32 %r153, -16;
sub.s32 %r154, %r153, %r31;
setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
add.s32 %r159, %r31, %r18;
add.s32 %r160, %r159, %r42;
add.s32 %r161, %r160, 16;
mad.lo.s32 %r162, %r161, %r67, %r20;
mul.wide.s32 %rd31, %r162, 4;
add.s64 %rd30, %rd3, %rd31;
// begin inline asm
ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
// end inline asm
mov.b32 %f244, %r155;
mov.b32 %f243, %r156;
mov.b32 %f242, %r157;
mov.b32 %f241, %r158;
$L__BB0_24:
div.s32 %r163, %r13, %r14;
shl.b32 %r43, %r163, 5;
shr.u32 %r165, %r98, 27;
add.s32 %r166, %r2, %r165;
and.b32 %r167, %r166, -32;
sub.s32 %r168, %r2, %r167;
add.s32 %r169, %r43, %r168;
setp.gt.s32 %p18, %r169, 215;
mul.lo.s32 %r170, %r169, %r62;
mul.wide.s32 %rd32, %r170, 4;
add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
ld.global.f32 %f100, [%rd9];
st.shared.f32 [%rd6], %f100;
st.shared.f32 [%rd6+512], %f100;
st.shared.f32 [%rd6+1024], %f100;
st.shared.f32 [%rd6+1536], %f100;
st.shared.f32 [%rd6+2048], %f100;
st.shared.f32 [%rd6+2560], %f100;
st.shared.f32 [%rd6+3072], %f100;
st.shared.f32 [%rd6+3584], %f100;
$L__BB0_26:
shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r45, %r44;
add.s32 %r46, %r19, %r43;
setp.ge.s32 %p19, %r46, %r45;
shr.s32 %r171, %r18, 31;
shr.u32 %r172, %r171, 25;
add.s32 %r173, %r18, %r172;
and.b32 %r174, %r173, -128;
sub.s32 %r175, %r18, %r174;
mul.wide.s32 %rd33, %r175, 4;
add.s64 %rd35, %rd19, %rd33;
add.s64 %rd10, %rd35, 4096;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
mov.b32 %f103, %r265;
sub.f32 %f20, %f103, %f245;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
mov.b32 %f105, %r261;
mul.f32 %f106, %f246, %f105;
sub.f32 %f23, %f20, %f106;
shl.b32 %r176, %r16, 7;
add.s32 %r177, %r176, %r18;
mul.wide.s32 %rd36, %r177, 4;
add.s64 %rd11, %rd19, %rd36;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f107, [%rd11];
mul.f32 %f247, %f107, %f17;
$L__BB0_32:
mul.f32 %f108, %f23, %f247;
// begin inline asm
{ cvt.rn.f16.f32 %rs13, %f108;}
// end inline asm
@%p19 bra $L__BB0_34;
ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
mov.b32 %f111, %r264;
sub.f32 %f28, %f111, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
mov.b32 %f113, %r260;
mul.f32 %f114, %f249, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f115, [%rd11+128];
mul.f32 %f250, %f115, %f17;
$L__BB0_38:
mul.f32 %f116, %f31, %f250;
// begin inline asm
{ cvt.rn.f16.f32 %rs14, %f116;}
// end inline asm
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
mov.b32 %f119, %r263;
sub.f32 %f36, %f119, %f251;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
mov.b32 %f121, %r259;
mul.f32 %f122, %f252, %f121;
sub.f32 %f39, %f36, %f122;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f123, [%rd11+256];
mul.f32 %f253, %f123, %f17;
$L__BB0_44:
mul.f32 %f124, %f39, %f253;
// begin inline asm
{ cvt.rn.f16.f32 %rs15, %f124;}
// end inline asm
@%p19 bra $L__BB0_46;
ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
mov.b32 %f127, %r262;
sub.f32 %f44, %f127, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
mov.b32 %f129, %r258;
mul.f32 %f130, %f255, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f131, [%rd11+384];
mul.f32 %f256, %f131, %f17;
$L__BB0_50:
mul.f32 %f132, %f47, %f256;
// begin inline asm
{ cvt.rn.f16.f32 %rs16, %f132;}
// end inline asm
mov.u32 %r178, -16;
sub.s32 %r47, %r178, %r44;
setp.ge.s32 %p31, %r46, %r47;
add.s32 %r48, %r18, 16;
shr.s32 %r179, %r48, 31;
shr.u32 %r180, %r179, 25;
add.s32 %r181, %r48, %r180;
and.b32 %r182, %r181, -128;
sub.s32 %r183, %r48, %r182;
mul.wide.s32 %rd38, %r183, 4;
add.s64 %rd40, %rd19, %rd38;
add.s64 %rd12, %rd40, 4096;
mov.f32 %f258, 0f00000000;
mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
sub.f32 %f52, %f244, %f257;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
mul.f32 %f136, %f258, %f240;
sub.f32 %f55, %f52, %f136;
mov.f32 %f260, 0f00000000;
mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f137, [%rd11+64];
mul.f32 %f259, %f137, %f17;
$L__BB0_56:
mul.f32 %f138, %f55, %f259;
// begin inline asm
{ cvt.rn.f16.f32 %rs17, %f138;}
// end inline asm
@%p31 bra $L__BB0_58;
ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
sub.f32 %f60, %f243, %f260;
mov.f32 %f262, 0f00000000;
mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
mul.f32 %f142, %f261, %f239;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f143, [%rd11+192];
mul.f32 %f262, %f143, %f17;
$L__BB0_62:
mul.f32 %f144, %f63, %f262;
// begin inline asm
{ cvt.rn.f16.f32 %rs18, %f144;}
// end inline asm
mov.f32 %f264, 0f00000000;
mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
sub.f32 %f68, %f242, %f263;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
mul.f32 %f148, %f264, %f238;
sub.f32 %f71, %f68, %f148;
mov.f32 %f266, 0f00000000;
mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f149, [%rd11+320];
mul.f32 %f265, %f149, %f17;
$L__BB0_68:
mul.f32 %f150, %f71, %f265;
// begin inline asm
{ cvt.rn.f16.f32 %rs19, %f150;}
// end inline asm
@%p31 bra $L__BB0_70;
ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
sub.f32 %f76, %f241, %f266;
mov.f32 %f268, 0f00000000;
mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
mul.f32 %f154, %f267, %f237;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f155, [%rd11+448];
mul.f32 %f268, %f155, %f17;
$L__BB0_74:
mul.f32 %f156, %f79, %f268;
// begin inline asm
{ cvt.rn.f16.f32 %rs20, %f156;}
// end inline asm
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r184, %r49;
setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
add.s32 %r187, %r49, %r18;
add.s32 %r188, %r187, %r43;
mad.lo.s32 %r189, %r188, %r67, %r20;
mul.wide.s32 %rd42, %r189, 2;
add.s64 %rd41, %rd15, %rd42;
mov.b32 %r186, {%rs15, %rs16};
mov.b32 %r185, {%rs13, %rs14};
// begin inline asm
st.global.cs.v2.s32 [%rd41], {%r185,%r186};
// end inline asm
$L__BB0_77:
mov.u32 %r190, -16;
sub.s32 %r191, %r190, %r49;
setp.ge.s32 %p46, %r46, %r191;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r194, %r48, %r49;
add.s32 %r195, %r194, %r43;
mad.lo.s32 %r196, %r195, %r67, %r20;
mul.wide.s32 %rd44, %r196, 2;
add.s64 %rd43, %rd15, %rd44;
mov.b32 %r193, {%rs19, %rs20};
mov.b32 %r192, {%rs17, %rs18};
// begin inline asm
st.global.cs.v2.s32 [%rd43], {%r192,%r193};
// end inline asm
$L__BB0_80:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -25,44 +25,44 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
- .reg .f32 %f<257>;
- .reg .b32 %r<243>;
+ .reg .f32 %f<269>;
+ .reg .b32 %r<266>;
.reg .f64 %fd<3>;
- .reg .b64 %rd<61>;
+ .reg .b64 %rd<69>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4+8];
- ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5];
- ld.param.u64 %rd12, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1];
- ld.param.u64 %rd11, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0];
+ ld.param.u64 %rd15, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_5];
+ ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_1];
+ ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_3];
- ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2];
- cvta.to.global.u64 %rd1, %rd11;
- cvta.to.global.u64 %rd2, %rd14;
+ ld.param.u64 %rd16, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_2];
+ cvta.to.global.u64 %rd1, %rd13;
+ cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd15, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r73, [%rd15], %r2;
+ mov.u64 %rd17, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s;
+ atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_6__halfELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
- mul.wide.s32 %rd16, %r2, 4;
- mov.u64 %rd17, _ZN11kernelscope6kernelE;
- add.s64 %rd6, %rd17, %rd16;
+ mul.wide.s32 %rd18, %r2, 4;
+ mov.u64 %rd19, _ZN11kernelscope6kernelE;
+ add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
@@ -75,646 +75,702 @@
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
- cvta.to.global.u64 %rd18, %rd12;
- mul.wide.s32 %rd19, %r8, 4;
- add.s64 %rd8, %rd18, %rd19;
+ cvta.to.global.u64 %rd20, %rd14;
+ mul.wide.s32 %rd21, %r8, 4;
+ add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
- add.s32 %r82, %r9, %r7;
- add.s32 %r83, %r82, 16;
- setp.gt.s32 %p4, %r83, 215;
+ add.s32 %r10, %r9, 16;
+ add.s32 %r82, %r10, %r7;
+ setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
- and.b32 %r87, %r81, 1073741816;
- sub.s32 %r88, %r2, %r87;
- shl.b32 %r10, %r88, 2;
- rem.s32 %r89, %r5, %r6;
- shl.b32 %r11, %r89, 5;
- add.s32 %r90, %r10, %r11;
- or.b32 %r91, %r90, 3;
- setp.ge.s32 %p5, %r91, %r67;
+ rem.s32 %r83, %r5, %r6;
+ shl.b32 %r11, %r83, 5;
+ or.b32 %r84, %r11, 31;
+ setp.ge.s32 %p5, %r84, %r67;
@%p5 bra $L__BB0_7;
- shr.u32 %r93, %r79, 27;
- add.s32 %r94, %r2, %r93;
- and.b32 %r95, %r94, -32;
- sub.s32 %r12, %r2, %r95;
- add.s32 %r13, %r7, %r12;
- setp.lt.s32 %p6, %r13, 216;
+ shr.u32 %r86, %r79, 27;
+ add.s32 %r87, %r2, %r86;
+ and.b32 %r88, %r87, -32;
+ sub.s32 %r89, %r2, %r88;
+ add.s32 %r12, %r7, %r89;
+ setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
- shl.b32 %r212, %r4, 5;
- add.s32 %r213, %r7, %r9;
- add.s32 %r214, %r213, %r212;
- mad.lo.s32 %r215, %r214, %r67, %r10;
- add.s32 %r216, %r215, %r11;
- mul.wide.s32 %rd46, %r216, 4;
- add.s64 %rd40, %rd4, %rd46;
-
- ld.global.cs.v4.u32 {%r192,%r193,%r194,%r195}, [%rd40];
-
- add.s32 %r217, %r214, 16;
- mad.lo.s32 %r218, %r217, %r67, %r10;
- add.s32 %r219, %r218, %r11;
- mul.wide.s32 %rd47, %r219, 4;
- add.s64 %rd41, %rd4, %rd47;
-
- ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd41];
-
- shl.b64 %rd48, %rd7, 2;
- add.s64 %rd49, %rd2, %rd48;
- ld.global.f32 %f166, [%rd49];
+ and.b32 %r220, %r81, -8;
+ sub.s32 %r221, %r2, %r220;
+ shl.b32 %r222, %r221, 2;
+ shl.b32 %r223, %r4, 5;
+ add.s32 %r224, %r7, %r9;
+ add.s32 %r225, %r224, %r223;
+ mad.lo.s32 %r226, %r225, %r67, %r222;
+ add.s32 %r227, %r226, %r11;
+ mul.wide.s32 %rd51, %r227, 4;
+ add.s64 %rd45, %rd4, %rd51;
+
+ ld.global.cs.v4.u32 {%r197,%r198,%r199,%r200}, [%rd45];
+
+ add.s32 %r228, %r225, 16;
+ mad.lo.s32 %r229, %r228, %r67, %r222;
+ add.s32 %r230, %r229, %r11;
+ mul.wide.s32 %rd52, %r230, 4;
+ add.s64 %rd46, %rd4, %rd52;
+
+ ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd46];
+
+ shl.b64 %rd53, %rd7, 2;
+ add.s64 %rd54, %rd2, %rd53;
+ ld.global.f32 %f166, [%rd54];
st.shared.f32 [%rd6+4096], %f166;
- mul.lo.s32 %r220, %r4, 96;
- add.s32 %r221, %r214, %r220;
- mad.lo.s32 %r222, %r221, %r67, %r10;
- add.s32 %r223, %r222, %r11;
- mul.wide.s32 %rd50, %r223, 4;
- add.s64 %rd42, %rd3, %rd50;
-
- ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd42];
-
- add.s32 %r224, %r217, %r220;
- mad.lo.s32 %r225, %r224, %r67, %r10;
- add.s32 %r226, %r225, %r11;
- mul.wide.s32 %rd51, %r226, 4;
- add.s64 %rd43, %rd3, %rd51;
-
- ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd43];
-
- mul.lo.s32 %r227, %r13, %r62;
- mul.wide.s32 %rd52, %r227, 4;
- add.s64 %rd53, %rd1, %rd52;
- mul.wide.s32 %rd54, %r12, 4;
- add.s64 %rd56, %rd17, %rd54;
- ld.global.f32 %f167, [%rd53];
- st.shared.f32 [%rd56], %f167;
+ mul.lo.s32 %r231, %r4, 96;
+ add.s32 %r232, %r225, %r231;
+ mad.lo.s32 %r233, %r232, %r67, %r222;
+ add.s32 %r234, %r233, %r11;
+ mul.wide.s32 %rd55, %r234, 4;
+ add.s64 %rd47, %rd3, %rd55;
+
+ ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd47];
+
+ add.s32 %r235, %r228, %r231;
+ mad.lo.s32 %r236, %r235, %r67, %r222;
+ add.s32 %r237, %r236, %r11;
+ mul.wide.s32 %rd56, %r237, 4;
+ add.s64 %rd48, %rd3, %rd56;
+
+ ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd48];
+
+ mul.lo.s32 %r238, %r12, %r62;
+ mul.wide.s32 %rd57, %r238, 4;
+ add.s64 %rd58, %rd1, %rd57;
+ ld.global.f32 %f167, [%rd58];
+ st.shared.f32 [%rd6], %f167;
+ st.shared.f32 [%rd6+512], %f167;
+ st.shared.f32 [%rd6+1024], %f167;
+ st.shared.f32 [%rd6+1536], %f167;
+ st.shared.f32 [%rd6+2048], %f167;
+ st.shared.f32 [%rd6+2560], %f167;
+ st.shared.f32 [%rd6+3072], %f167;
+ st.shared.f32 [%rd6+3584], %f167;
barrier.sync 0;
- mul.wide.s32 %rd57, %r9, 4;
- add.s64 %rd58, %rd17, %rd57;
- ld.shared.f32 %f168, [%rd58];
- cvt.rn.f32.f64 %f169, %fd1;
- mul.f32 %f170, %f168, %f169;
- mov.b32 %f171, %r200;
- ld.shared.f32 %f172, [%rd58+4096];
- sub.f32 %f173, %f171, %f172;
- mov.b32 %f174, %r192;
- ld.shared.f32 %f175, [%rd58+4608];
- mul.f32 %f176, %f175, %f174;
- sub.f32 %f177, %f173, %f176;
- mul.f32 %f157, %f170, %f177;
- mov.b32 %f178, %r201;
- sub.f32 %f179, %f178, %f172;
- mov.b32 %f180, %r193;
- mul.f32 %f181, %f175, %f180;
+ shl.b32 %r239, %r221, 7;
+ add.s32 %r240, %r239, %r9;
+ shr.s32 %r241, %r9, 31;
+ shr.u32 %r242, %r241, 25;
+ add.s32 %r243, %r9, %r242;
+ and.b32 %r244, %r243, -128;
+ sub.s32 %r245, %r9, %r244;
+ mul.wide.s32 %rd59, %r245, 4;
+ add.s64 %rd61, %rd19, 4096;
+ add.s64 %rd62, %rd61, %rd59;
+ mov.b32 %f168, %r205;
+ ld.shared.f32 %f169, [%rd62];
+ sub.f32 %f170, %f168, %f169;
+ mov.b32 %f171, %r197;
+ ld.shared.f32 %f172, [%rd62+512];
+ mul.f32 %f173, %f172, %f171;
+ sub.f32 %f174, %f170, %f173;
+ mul.wide.s32 %rd63, %r240, 4;
+ add.s64 %rd64, %rd19, %rd63;
+ ld.shared.f32 %f175, [%rd64];
+ cvt.rn.f32.f64 %f176, %fd1;
+ mul.f32 %f177, %f175, %f176;
+ mul.f32 %f157, %f177, %f174;
+ mov.b32 %f178, %r206;
+ sub.f32 %f179, %f178, %f169;
+ mov.b32 %f180, %r198;
+ mul.f32 %f181, %f172, %f180;
sub.f32 %f182, %f179, %f181;
- mul.f32 %f158, %f170, %f182;
+ ld.shared.f32 %f183, [%rd64+128];
+ mul.f32 %f184, %f183, %f176;
+ mul.f32 %f158, %f184, %f182;
{ cvt.rn.f16.f32 %rs22, %f158;}
{ cvt.rn.f16.f32 %rs21, %f157;}
- mov.b32 %r208, {%rs21, %rs22};
- mov.b32 %f183, %r202;
- sub.f32 %f184, %f183, %f172;
- mov.b32 %f185, %r194;
- mul.f32 %f186, %f175, %f185;
- sub.f32 %f187, %f184, %f186;
- mul.f32 %f159, %f170, %f187;
- mov.b32 %f188, %r203;
- sub.f32 %f189, %f188, %f172;
- mov.b32 %f190, %r195;
- mul.f32 %f191, %f175, %f190;
- sub.f32 %f192, %f189, %f191;
- mul.f32 %f160, %f170, %f192;
+ mov.b32 %r213, {%rs21, %rs22};
+ mov.b32 %f185, %r207;
+ sub.f32 %f186, %f185, %f169;
+ mov.b32 %f187, %r199;
+ mul.f32 %f188, %f172, %f187;
+ sub.f32 %f189, %f186, %f188;
+ ld.shared.f32 %f190, [%rd64+256];
+ mul.f32 %f191, %f190, %f176;
+ mul.f32 %f159, %f191, %f189;
+ mov.b32 %f192, %r208;
+ sub.f32 %f193, %f192, %f169;
+ mov.b32 %f194, %r200;
+ mul.f32 %f195, %f172, %f194;
+ sub.f32 %f196, %f193, %f195;
+ ld.shared.f32 %f197, [%rd64+384];
+ mul.f32 %f198, %f197, %f176;
+ mul.f32 %f160, %f198, %f196;
{ cvt.rn.f16.f32 %rs24, %f160;}
{ cvt.rn.f16.f32 %rs23, %f159;}
- mov.b32 %r209, {%rs23, %rs24};
- ld.shared.f32 %f193, [%rd58+64];
- mul.f32 %f194, %f193, %f169;
- mov.b32 %f195, %r204;
- ld.shared.f32 %f196, [%rd58+4160];
- sub.f32 %f197, %f195, %f196;
- mov.b32 %f198, %r196;
- ld.shared.f32 %f199, [%rd58+4672];
- mul.f32 %f200, %f199, %f198;
- sub.f32 %f201, %f197, %f200;
- mul.f32 %f161, %f194, %f201;
- mov.b32 %f202, %r205;
- sub.f32 %f203, %f202, %f196;
- mov.b32 %f204, %r197;
- mul.f32 %f205, %f199, %f204;
- sub.f32 %f206, %f203, %f205;
- mul.f32 %f162, %f194, %f206;
+ mov.b32 %r214, {%rs23, %rs24};
+ shr.s32 %r246, %r10, 31;
+ shr.u32 %r247, %r246, 25;
+ add.s32 %r248, %r10, %r247;
+ and.b32 %r249, %r248, -128;
+ sub.s32 %r250, %r10, %r249;
+ mul.wide.s32 %rd65, %r250, 4;
+ add.s64 %rd66, %rd61, %rd65;
+ mov.b32 %f199, %r209;
+ ld.shared.f32 %f200, [%rd66];
+ sub.f32 %f201, %f199, %f200;
+ mov.b32 %f202, %r201;
+ ld.shared.f32 %f203, [%rd66+512];
+ mul.f32 %f204, %f203, %f202;
+ sub.f32 %f205, %f201, %f204;
+ ld.shared.f32 %f206, [%rd64+64];
+ mul.f32 %f207, %f206, %f176;
+ mul.f32 %f161, %f207, %f205;
+ mov.b32 %f208, %r210;
+ sub.f32 %f209, %f208, %f200;
+ mov.b32 %f210, %r202;
+ mul.f32 %f211, %f203, %f210;
+ sub.f32 %f212, %f209, %f211;
+ ld.shared.f32 %f213, [%rd64+192];
+ mul.f32 %f214, %f213, %f176;
+ mul.f32 %f162, %f214, %f212;
{ cvt.rn.f16.f32 %rs26, %f162;}
{ cvt.rn.f16.f32 %rs25, %f161;}
- mov.b32 %r210, {%rs25, %rs26};
- mov.b32 %f207, %r206;
- sub.f32 %f208, %f207, %f196;
- mov.b32 %f209, %r198;
- mul.f32 %f210, %f199, %f209;
- sub.f32 %f211, %f208, %f210;
- mul.f32 %f163, %f194, %f211;
- mov.b32 %f212, %r207;
- sub.f32 %f213, %f212, %f196;
- mov.b32 %f214, %r199;
- mul.f32 %f215, %f199, %f214;
- sub.f32 %f216, %f213, %f215;
- mul.f32 %f164, %f194, %f216;
+ mov.b32 %r215, {%rs25, %rs26};
+ mov.b32 %f215, %r211;
+ sub.f32 %f216, %f215, %f200;
+ mov.b32 %f217, %r203;
+ mul.f32 %f218, %f203, %f217;
+ sub.f32 %f219, %f216, %f218;
+ ld.shared.f32 %f220, [%rd64+320];
+ mul.f32 %f221, %f220, %f176;
+ mul.f32 %f163, %f221, %f219;
+ mov.b32 %f222, %r212;
+ sub.f32 %f223, %f222, %f200;
+ mov.b32 %f224, %r204;
+ mul.f32 %f225, %f203, %f224;
+ sub.f32 %f226, %f223, %f225;
+ ld.shared.f32 %f227, [%rd64+448];
+ mul.f32 %f228, %f227, %f176;
+ mul.f32 %f164, %f228, %f226;
{ cvt.rn.f16.f32 %rs28, %f164;}
{ cvt.rn.f16.f32 %rs27, %f163;}
- mov.b32 %r211, {%rs27, %rs28};
- mul.lo.s32 %r228, %r4, 896;
- add.s32 %r229, %r221, %r228;
- mad.lo.s32 %r230, %r229, %r67, %r10;
- add.s32 %r231, %r230, %r11;
- mul.wide.s32 %rd59, %r231, 2;
- add.s64 %rd44, %rd13, %rd59;
-
- st.global.cs.v2.s32 [%rd44], {%r208,%r209};
-
- add.s32 %r232, %r224, %r228;
- mad.lo.s32 %r233, %r232, %r67, %r10;
- add.s32 %r234, %r233, %r11;
- mul.wide.s32 %rd60, %r234, 2;
- add.s64 %rd45, %rd13, %rd60;
-
- st.global.cs.v2.s32 [%rd45], {%r210,%r211};
+ mov.b32 %r216, {%rs27, %rs28};
+ shl.b32 %r251, %r4, 10;
+ add.s32 %r252, %r224, %r251;
+ mad.lo.s32 %r253, %r252, %r67, %r222;
+ add.s32 %r254, %r253, %r11;
+ mul.wide.s32 %rd67, %r254, 2;
+ add.s64 %rd49, %rd15, %rd67;
+
+ st.global.cs.v2.s32 [%rd49], {%r213,%r214};
+
+ add.s32 %r255, %r252, 16;
+ mad.lo.s32 %r256, %r255, %r67, %r222;
+ add.s32 %r257, %r256, %r11;
+ mul.wide.s32 %rd68, %r257, 2;
+ add.s64 %rd50, %rd15, %rd68;
+
+ st.global.cs.v2.s32 [%rd50], {%r215,%r216};
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
- mov.u32 %r14, %ctaid.x;
- add.s32 %r100, %r67, 31;
- shr.s32 %r101, %r100, 31;
- shr.u32 %r102, %r101, 27;
- add.s32 %r103, %r100, %r102;
- shr.s32 %r15, %r103, 5;
- shl.b32 %r16, %r4, 5;
- shr.s32 %r104, %r2, 31;
- shr.u32 %r105, %r104, 29;
- add.s32 %r106, %r2, %r105;
- and.b32 %r107, %r106, 1073741816;
- sub.s32 %r108, %r2, %r107;
- shl.b32 %r109, %r108, 2;
- rem.s32 %r110, %r14, %r15;
- shl.b32 %r111, %r110, 5;
- add.s32 %r20, %r111, %r109;
+ mov.u32 %r13, %ctaid.x;
+ add.s32 %r94, %r67, 31;
+ shr.s32 %r95, %r94, 31;
+ shr.u32 %r96, %r95, 27;
+ add.s32 %r97, %r94, %r96;
+ shr.s32 %r14, %r97, 5;
+ shl.b32 %r15, %r4, 5;
+ shr.s32 %r98, %r2, 31;
+ shr.u32 %r99, %r98, 29;
+ add.s32 %r100, %r2, %r99;
+ and.b32 %r101, %r100, -8;
+ sub.s32 %r16, %r2, %r101;
+ shl.b32 %r102, %r16, 2;
+ rem.s32 %r103, %r13, %r14;
+ shl.b32 %r104, %r103, 5;
+ add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
- shr.s32 %r18, %r106, 3;
+ shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
- mov.u32 %r235, 0;
- mov.u32 %r236, %r235;
- mov.u32 %r237, %r235;
- mov.u32 %r238, %r235;
+ mov.u32 %r258, 0;
+ mov.u32 %r259, %r258;
+ mov.u32 %r260, %r258;
+ mov.u32 %r261, %r258;
@%p8 bra $L__BB0_12;
- div.s32 %r116, %r14, %r15;
- shl.b32 %r21, %r116, 5;
- add.s32 %r117, %r19, %r21;
- neg.s32 %r118, %r16;
- setp.ge.s32 %p9, %r117, %r118;
+ div.s32 %r109, %r13, %r14;
+ shl.b32 %r21, %r109, 5;
+ add.s32 %r110, %r19, %r21;
+ neg.s32 %r111, %r15;
+ setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
- add.s32 %r123, %r16, %r18;
- add.s32 %r124, %r123, %r21;
- mad.lo.s32 %r125, %r124, %r67, %r20;
- mul.wide.s32 %rd21, %r125, 4;
- add.s64 %rd20, %rd4, %rd21;
-
- ld.global.cs.v4.u32 {%r238,%r237,%r236,%r235}, [%rd20];
+ add.s32 %r116, %r15, %r18;
+ add.s32 %r117, %r116, %r21;
+ mad.lo.s32 %r118, %r117, %r67, %r20;
+ mul.wide.s32 %rd23, %r118, 4;
+ add.s64 %rd22, %rd4, %rd23;
+
+ ld.global.cs.v4.u32 {%r261,%r260,%r259,%r258}, [%rd22];
$L__BB0_12:
- mov.f32 %f225, 0f00000000;
- mov.f32 %f226, 0f00000000;
- mov.f32 %f227, 0f00000000;
- mov.f32 %f228, 0f00000000;
+ mov.f32 %f237, 0f00000000;
+ mov.f32 %f238, 0f00000000;
+ mov.f32 %f239, 0f00000000;
+ mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
- div.s32 %r126, %r14, %r15;
- shl.b32 %r30, %r126, 5;
- add.s32 %r127, %r19, %r30;
- mov.u32 %r128, -16;
- sub.s32 %r129, %r128, %r16;
- setp.ge.s32 %p11, %r127, %r129;
+ div.s32 %r119, %r13, %r14;
+ shl.b32 %r30, %r119, 5;
+ add.s32 %r120, %r19, %r30;
+ mov.u32 %r121, -16;
+ sub.s32 %r122, %r121, %r15;
+ setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
- add.s32 %r134, %r16, %r18;
- add.s32 %r135, %r134, %r30;
- add.s32 %r136, %r135, 16;
- mad.lo.s32 %r137, %r136, %r67, %r20;
- mul.wide.s32 %rd23, %r137, 4;
- add.s64 %rd22, %rd4, %rd23;
-
- ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
-
- mov.b32 %f228, %r130;
- mov.b32 %f227, %r131;
- mov.b32 %f226, %r132;
- mov.b32 %f225, %r133;
+ add.s32 %r127, %r15, %r18;
+ add.s32 %r128, %r127, %r30;
+ add.s32 %r129, %r128, 16;
+ mad.lo.s32 %r130, %r129, %r67, %r20;
+ mul.wide.s32 %rd25, %r130, 4;
+ add.s64 %rd24, %rd4, %rd25;
+
+ ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
+
+ mov.b32 %f240, %r123;
+ mov.b32 %f239, %r124;
+ mov.b32 %f238, %r125;
+ mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
- div.s32 %r138, %r14, %r15;
- shl.b32 %r139, %r138, 5;
- add.s32 %r32, %r139, %r2;
+ div.s32 %r131, %r13, %r14;
+ shl.b32 %r132, %r131, 5;
+ add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
- mul.wide.s32 %rd24, %r32, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f91, [%rd25];
+ mul.wide.s32 %rd26, %r32, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f91, [%rd27];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
- mov.u32 %r239, 0;
- mov.u32 %r240, %r239;
- mov.u32 %r241, %r239;
- mov.u32 %r242, %r239;
+ mov.u32 %r262, 0;
+ mov.u32 %r263, %r262;
+ mov.u32 %r264, %r262;
+ mov.u32 %r265, %r262;
@%p8 bra $L__BB0_21;
- div.s32 %r148, %r14, %r15;
- shl.b32 %r33, %r148, 5;
- add.s32 %r149, %r19, %r33;
- neg.s32 %r150, %r31;
- setp.ge.s32 %p15, %r149, %r150;
+ div.s32 %r141, %r13, %r14;
+ shl.b32 %r33, %r141, 5;
+ add.s32 %r142, %r19, %r33;
+ neg.s32 %r143, %r31;
+ setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
- add.s32 %r155, %r31, %r18;
- add.s32 %r156, %r155, %r33;
- mad.lo.s32 %r157, %r156, %r67, %r20;
- mul.wide.s32 %rd27, %r157, 4;
- add.s64 %rd26, %rd3, %rd27;
-
- ld.global.cs.v4.u32 {%r242,%r241,%r240,%r239}, [%rd26];
+ add.s32 %r148, %r31, %r18;
+ add.s32 %r149, %r148, %r33;
+ mad.lo.s32 %r150, %r149, %r67, %r20;
+ mul.wide.s32 %rd29, %r150, 4;
+ add.s64 %rd28, %rd3, %rd29;
+
+ ld.global.cs.v4.u32 {%r265,%r264,%r263,%r262}, [%rd28];
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
- mov.f32 %f229, 0f00000000;
- mov.f32 %f230, 0f00000000;
- mov.f32 %f231, 0f00000000;
- mov.f32 %f232, 0f00000000;
+ mov.f32 %f241, 0f00000000;
+ mov.f32 %f242, 0f00000000;
+ mov.f32 %f243, 0f00000000;
+ mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
- div.s32 %r158, %r14, %r15;
- shl.b32 %r42, %r158, 5;
- add.s32 %r159, %r19, %r42;
- mov.u32 %r160, -16;
- sub.s32 %r161, %r160, %r31;
- setp.ge.s32 %p17, %r159, %r161;
+ div.s32 %r151, %r13, %r14;
+ shl.b32 %r42, %r151, 5;
+ add.s32 %r152, %r19, %r42;
+ mov.u32 %r153, -16;
+ sub.s32 %r154, %r153, %r31;
+ setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
- add.s32 %r166, %r31, %r18;
- add.s32 %r167, %r166, %r42;
- add.s32 %r168, %r167, 16;
- mad.lo.s32 %r169, %r168, %r67, %r20;
- mul.wide.s32 %rd29, %r169, 4;
- add.s64 %rd28, %rd3, %rd29;
-
- ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
-
- mov.b32 %f232, %r162;
- mov.b32 %f231, %r163;
- mov.b32 %f230, %r164;
- mov.b32 %f229, %r165;
+ add.s32 %r159, %r31, %r18;
+ add.s32 %r160, %r159, %r42;
+ add.s32 %r161, %r160, 16;
+ mad.lo.s32 %r162, %r161, %r67, %r20;
+ mul.wide.s32 %rd31, %r162, 4;
+ add.s64 %rd30, %rd3, %rd31;
+
+ ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
+
+ mov.b32 %f244, %r155;
+ mov.b32 %f243, %r156;
+ mov.b32 %f242, %r157;
+ mov.b32 %f241, %r158;
$L__BB0_24:
- div.s32 %r170, %r14, %r15;
- shl.b32 %r43, %r170, 5;
- shr.u32 %r172, %r104, 27;
- add.s32 %r173, %r2, %r172;
- and.b32 %r174, %r173, -32;
- sub.s32 %r44, %r2, %r174;
- add.s32 %r175, %r43, %r44;
- setp.gt.s32 %p18, %r175, 215;
- mul.lo.s32 %r176, %r175, %r62;
- mul.wide.s32 %rd30, %r176, 4;
- add.s64 %rd9, %rd1, %rd30;
+ div.s32 %r163, %r13, %r14;
+ shl.b32 %r43, %r163, 5;
+ shr.u32 %r165, %r98, 27;
+ add.s32 %r166, %r2, %r165;
+ and.b32 %r167, %r166, -32;
+ sub.s32 %r168, %r2, %r167;
+ add.s32 %r169, %r43, %r168;
+ setp.gt.s32 %p18, %r169, 215;
+ mul.lo.s32 %r170, %r169, %r62;
+ mul.wide.s32 %rd32, %r170, 4;
+ add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
- mul.wide.s32 %rd31, %r44, 4;
- add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f100, [%rd9];
- st.shared.f32 [%rd33], %f100;
+ st.shared.f32 [%rd6], %f100;
+ st.shared.f32 [%rd6+512], %f100;
+ st.shared.f32 [%rd6+1024], %f100;
+ st.shared.f32 [%rd6+1536], %f100;
+ st.shared.f32 [%rd6+2048], %f100;
+ st.shared.f32 [%rd6+2560], %f100;
+ st.shared.f32 [%rd6+3072], %f100;
+ st.shared.f32 [%rd6+3584], %f100;
$L__BB0_26:
- shl.b32 %r45, %r4, 9;
+ shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
- neg.s32 %r46, %r45;
- add.s32 %r47, %r19, %r43;
- setp.ge.s32 %p19, %r47, %r46;
- mul.wide.s32 %rd34, %r18, 4;
- add.s64 %rd10, %rd17, %rd34;
- mov.f32 %f234, 0f00000000;
- mov.f32 %f233, %f234;
+ neg.s32 %r45, %r44;
+ add.s32 %r46, %r19, %r43;
+ setp.ge.s32 %p19, %r46, %r45;
+ shr.s32 %r171, %r18, 31;
+ shr.u32 %r172, %r171, 25;
+ add.s32 %r173, %r18, %r172;
+ and.b32 %r174, %r173, -128;
+ sub.s32 %r175, %r18, %r174;
+ mul.wide.s32 %rd33, %r175, 4;
+ add.s64 %rd35, %rd19, %rd33;
+ add.s64 %rd10, %rd35, 4096;
+ mov.f32 %f246, 0f00000000;
+ mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
- ld.shared.f32 %f233, [%rd10+4096];
+ ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
- mov.b32 %f103, %r242;
- sub.f32 %f20, %f103, %f233;
+ mov.b32 %f103, %r265;
+ sub.f32 %f20, %f103, %f245;
@%p19 bra $L__BB0_30;
- ld.shared.f32 %f234, [%rd10+4608];
+ ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
- mov.b32 %f105, %r238;
- mul.f32 %f106, %f234, %f105;
+ mov.b32 %f105, %r261;
+ mul.f32 %f106, %f246, %f105;
sub.f32 %f23, %f20, %f106;
- mov.f32 %f236, 0f00000000;
- mov.f32 %f235, %f236;
+ shl.b32 %r176, %r16, 7;
+ add.s32 %r177, %r176, %r18;
+ mul.wide.s32 %rd36, %r177, 4;
+ add.s64 %rd11, %rd19, %rd36;
+ mov.f32 %f248, 0f00000000;
+ mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
- ld.shared.f32 %f107, [%rd10];
- mul.f32 %f235, %f107, %f17;
+ ld.shared.f32 %f107, [%rd11];
+ mul.f32 %f247, %f107, %f17;
$L__BB0_32:
- mul.f32 %f108, %f23, %f235;
+ mul.f32 %f108, %f23, %f247;
{ cvt.rn.f16.f32 %rs13, %f108;}
@%p19 bra $L__BB0_34;
- ld.shared.f32 %f236, [%rd10+4096];
+ ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
- mov.b32 %f111, %r241;
- sub.f32 %f28, %f111, %f236;
- mov.f32 %f238, 0f00000000;
- mov.f32 %f237, %f238;
+ mov.b32 %f111, %r264;
+ sub.f32 %f28, %f111, %f248;
+ mov.f32 %f250, 0f00000000;
+ mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
- ld.shared.f32 %f237, [%rd10+4608];
+ ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
- mov.b32 %f113, %r237;
- mul.f32 %f114, %f237, %f113;
+ mov.b32 %f113, %r260;
+ mul.f32 %f114, %f249, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
- ld.shared.f32 %f115, [%rd10];
- mul.f32 %f238, %f115, %f17;
+ ld.shared.f32 %f115, [%rd11+128];
+ mul.f32 %f250, %f115, %f17;
$L__BB0_38:
- mul.f32 %f116, %f31, %f238;
+ mul.f32 %f116, %f31, %f250;
{ cvt.rn.f16.f32 %rs14, %f116;}
- mov.f32 %f240, 0f00000000;
- mov.f32 %f239, %f240;
+ mov.f32 %f252, 0f00000000;
+ mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
- ld.shared.f32 %f239, [%rd10+4096];
+ ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
- mov.b32 %f119, %r240;
- sub.f32 %f36, %f119, %f239;
+ mov.b32 %f119, %r263;
+ sub.f32 %f36, %f119, %f251;
@%p19 bra $L__BB0_42;
- ld.shared.f32 %f240, [%rd10+4608];
+ ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
- mov.b32 %f121, %r236;
- mul.f32 %f122, %f240, %f121;
+ mov.b32 %f121, %r259;
+ mul.f32 %f122, %f252, %f121;
sub.f32 %f39, %f36, %f122;
- mov.f32 %f242, 0f00000000;
- mov.f32 %f241, %f242;
+ mov.f32 %f254, 0f00000000;
+ mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
- ld.shared.f32 %f123, [%rd10];
- mul.f32 %f241, %f123, %f17;
+ ld.shared.f32 %f123, [%rd11+256];
+ mul.f32 %f253, %f123, %f17;
$L__BB0_44:
- mul.f32 %f124, %f39, %f241;
+ mul.f32 %f124, %f39, %f253;
{ cvt.rn.f16.f32 %rs15, %f124;}
@%p19 bra $L__BB0_46;
- ld.shared.f32 %f242, [%rd10+4096];
+ ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
- mov.b32 %f127, %r239;
- sub.f32 %f44, %f127, %f242;
- mov.f32 %f244, 0f00000000;
- mov.f32 %f243, %f244;
+ mov.b32 %f127, %r262;
+ sub.f32 %f44, %f127, %f254;
+ mov.f32 %f256, 0f00000000;
+ mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
- ld.shared.f32 %f243, [%rd10+4608];
+ ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
- mov.b32 %f129, %r235;
- mul.f32 %f130, %f243, %f129;
+ mov.b32 %f129, %r258;
+ mul.f32 %f130, %f255, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
- ld.shared.f32 %f131, [%rd10];
- mul.f32 %f244, %f131, %f17;
+ ld.shared.f32 %f131, [%rd11+384];
+ mul.f32 %f256, %f131, %f17;
$L__BB0_50:
- mul.f32 %f132, %f47, %f244;
+ mul.f32 %f132, %f47, %f256;
{ cvt.rn.f16.f32 %rs16, %f132;}
- mov.u32 %r177, -16;
- sub.s32 %r48, %r177, %r45;
- setp.ge.s32 %p31, %r47, %r48;
- mov.f32 %f246, 0f00000000;
- mov.f32 %f245, %f246;
+ mov.u32 %r178, -16;
+ sub.s32 %r47, %r178, %r44;
+ setp.ge.s32 %p31, %r46, %r47;
+ add.s32 %r48, %r18, 16;
+ shr.s32 %r179, %r48, 31;
+ shr.u32 %r180, %r179, 25;
+ add.s32 %r181, %r48, %r180;
+ and.b32 %r182, %r181, -128;
+ sub.s32 %r183, %r48, %r182;
+ mul.wide.s32 %rd38, %r183, 4;
+ add.s64 %rd40, %rd19, %rd38;
+ add.s64 %rd12, %rd40, 4096;
+ mov.f32 %f258, 0f00000000;
+ mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
- ld.shared.f32 %f245, [%rd10+4160];
+ ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
- sub.f32 %f52, %f232, %f245;
+ sub.f32 %f52, %f244, %f257;
@%p31 bra $L__BB0_54;
- ld.shared.f32 %f246, [%rd10+4672];
+ ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
- mul.f32 %f136, %f246, %f228;
+ mul.f32 %f136, %f258, %f240;
sub.f32 %f55, %f52, %f136;
- mov.f32 %f248, 0f00000000;
- mov.f32 %f247, %f248;
+ mov.f32 %f260, 0f00000000;
+ mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
- ld.shared.f32 %f137, [%rd10+64];
- mul.f32 %f247, %f137, %f17;
+ ld.shared.f32 %f137, [%rd11+64];
+ mul.f32 %f259, %f137, %f17;
$L__BB0_56:
- mul.f32 %f138, %f55, %f247;
+ mul.f32 %f138, %f55, %f259;
{ cvt.rn.f16.f32 %rs17, %f138;}
@%p31 bra $L__BB0_58;
- ld.shared.f32 %f248, [%rd10+4160];
+ ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
- sub.f32 %f60, %f231, %f248;
- mov.f32 %f250, 0f00000000;
- mov.f32 %f249, %f250;
+ sub.f32 %f60, %f243, %f260;
+ mov.f32 %f262, 0f00000000;
+ mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
- ld.shared.f32 %f249, [%rd10+4672];
+ ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
- mul.f32 %f142, %f249, %f227;
+ mul.f32 %f142, %f261, %f239;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
- ld.shared.f32 %f143, [%rd10+64];
- mul.f32 %f250, %f143, %f17;
+ ld.shared.f32 %f143, [%rd11+192];
+ mul.f32 %f262, %f143, %f17;
$L__BB0_62:
- mul.f32 %f144, %f63, %f250;
+ mul.f32 %f144, %f63, %f262;
{ cvt.rn.f16.f32 %rs18, %f144;}
- mov.f32 %f252, 0f00000000;
- mov.f32 %f251, %f252;
+ mov.f32 %f264, 0f00000000;
+ mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
- ld.shared.f32 %f251, [%rd10+4160];
+ ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
- sub.f32 %f68, %f230, %f251;
+ sub.f32 %f68, %f242, %f263;
@%p31 bra $L__BB0_66;
- ld.shared.f32 %f252, [%rd10+4672];
+ ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
- mul.f32 %f148, %f252, %f226;
+ mul.f32 %f148, %f264, %f238;
sub.f32 %f71, %f68, %f148;
- mov.f32 %f254, 0f00000000;
- mov.f32 %f253, %f254;
+ mov.f32 %f266, 0f00000000;
+ mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
- ld.shared.f32 %f149, [%rd10+64];
- mul.f32 %f253, %f149, %f17;
+ ld.shared.f32 %f149, [%rd11+320];
+ mul.f32 %f265, %f149, %f17;
$L__BB0_68:
- mul.f32 %f150, %f71, %f253;
+ mul.f32 %f150, %f71, %f265;
{ cvt.rn.f16.f32 %rs19, %f150;}
@%p31 bra $L__BB0_70;
- ld.shared.f32 %f254, [%rd10+4160];
+ ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
- sub.f32 %f76, %f229, %f254;
- mov.f32 %f256, 0f00000000;
- mov.f32 %f255, %f256;
+ sub.f32 %f76, %f241, %f266;
+ mov.f32 %f268, 0f00000000;
+ mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
- ld.shared.f32 %f255, [%rd10+4672];
+ ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
- mul.f32 %f154, %f255, %f225;
+ mul.f32 %f154, %f267, %f237;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
- ld.shared.f32 %f155, [%rd10+64];
- mul.f32 %f256, %f155, %f17;
+ ld.shared.f32 %f155, [%rd11+448];
+ mul.f32 %f268, %f155, %f17;
$L__BB0_74:
- mul.f32 %f156, %f79, %f256;
+ mul.f32 %f156, %f79, %f268;
{ cvt.rn.f16.f32 %rs20, %f156;}
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
- neg.s32 %r178, %r49;
- setp.ge.s32 %p44, %r47, %r178;
+ neg.s32 %r184, %r49;
+ setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
- add.s32 %r181, %r49, %r18;
- add.s32 %r182, %r181, %r43;
- mad.lo.s32 %r183, %r182, %r67, %r20;
- mul.wide.s32 %rd37, %r183, 2;
- add.s64 %rd36, %rd13, %rd37;
- mov.b32 %r180, {%rs15, %rs16};
- mov.b32 %r179, {%rs13, %rs14};
-
- st.global.cs.v2.s32 [%rd36], {%r179,%r180};
+ add.s32 %r187, %r49, %r18;
+ add.s32 %r188, %r187, %r43;
+ mad.lo.s32 %r189, %r188, %r67, %r20;
+ mul.wide.s32 %rd42, %r189, 2;
+ add.s64 %rd41, %rd15, %rd42;
+ mov.b32 %r186, {%rs15, %rs16};
+ mov.b32 %r185, {%rs13, %rs14};
+
+ st.global.cs.v2.s32 [%rd41], {%r185,%r186};
$L__BB0_77:
- mov.u32 %r184, -16;
- sub.s32 %r185, %r184, %r49;
- setp.ge.s32 %p46, %r47, %r185;
+ mov.u32 %r190, -16;
+ sub.s32 %r191, %r190, %r49;
+ setp.ge.s32 %p46, %r46, %r191;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
- add.s32 %r188, %r49, %r18;
- add.s32 %r189, %r188, %r43;
- add.s32 %r190, %r189, 16;
- mad.lo.s32 %r191, %r190, %r67, %r20;
- mul.wide.s32 %rd39, %r191, 2;
- add.s64 %rd38, %rd13, %rd39;
- mov.b32 %r187, {%rs19, %rs20};
- mov.b32 %r186, {%rs17, %rs18};
-
- st.global.cs.v2.s32 [%rd38], {%r186,%r187};
+ add.s32 %r194, %r48, %r49;
+ add.s32 %r195, %r194, %r43;
+ mad.lo.s32 %r196, %r195, %r67, %r20;
+ mul.wide.s32 %rd44, %r196, 2;
+ add.s64 %rd43, %rd15, %rd44;
+ mov.b32 %r193, {%rs19, %rs20};
+ mov.b32 %r192, {%rs17, %rs18};
+
+ st.global.cs.v2.s32 [%rd43], {%r192,%r193};
$L__BB0_80:
ret;
19: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_32
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<631>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
shl.b32 %r248, %r4, 2;
max.s32 %r249, %r2, %r3;
mad.lo.s32 %r250, %r248, %r249, 15;
and.b32 %r251, %r250, -16;
cvt.u64.u32 %rd2, %r251;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r252, %r8, 7;
setp.lt.s32 %p11, %r252, %r202;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
// end inline asm
shl.b32 %r256, %r5, 4;
add.s32 %r254, %r253, %r256;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r255, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r255, 0;
cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r583, %r6, 4;
add.s32 %r257, %r4, 215;
div.s32 %r258, %r257, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r259, %r11, %r258;
add.s32 %r260, %r259, -1;
div.s32 %r12, %r260, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r262, %ctaid.y;
mul.lo.s32 %r263, %r12, %r4;
mul.lo.s32 %r13, %r263, %r262;
shl.b32 %r264, %r9, 1;
mov.u32 %r265, 1;
shl.b32 %r266, %r5, 4;
mad.lo.s32 %r14, %r264, %r202, %r266;
mul.lo.s32 %r267, %r202, %r9;
cvt.s64.s32 %rd52, %r267;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r268, %r13, %r202;
cvt.s64.s32 %rd6, %r268;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r269, %tid.z;
mad.lo.s32 %r270, %r4, %r269, %r9;
mad.lo.s32 %r15, %r270, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r271, %r3;
mov.u32 %r272, 31;
sub.s32 %r273, %r272, %r271;
shl.b32 %r16, %r265, %r273;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r274, %r16, %r5;
setp.lt.u32 %p18, %r274, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r275, %r15, %r16;
mul.wide.s32 %rd55, %r275, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r276, %r16, 31;
add.s32 %r277, %r16, %r276;
shr.s32 %r17, %r277, 1;
add.s32 %r18, %r267, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r18, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r270, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r580, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r14, %r281;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r14, %r284;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r580, %r4;
add.s32 %r279, %r23, %r9;
add.s32 %r24, %r279, %r13;
setp.gt.s32 %p19, %r24, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r24, %r211;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r24, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r578, %r580, %r4;
mul.lo.s32 %r287, %r578, %r202;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r576, %r580, %r4;
add.s32 %r575, %r576, %r9;
add.s32 %r574, %r575, %r13;
setp.gt.s32 %p204, %r574, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r24, %r215;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r583, %r583, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r581, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r581;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r581, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r581, 1;
setp.gt.u32 %p27, %r581, 3;
mov.u32 %r581, %r36;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r582, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r582;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r582, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r582, 1;
setp.gt.u32 %p33, %r582, 3;
mov.u32 %r582, %r38;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
add.s32 %r351, %r13, %r577;
mad.lo.s32 %r352, %r351, %r202, %r18;
mul.wide.s32 %rd82, %r352, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r580, %r580, 1;
setp.lt.s32 %p39, %r580, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r353, %tid.z;
mad.lo.s32 %r354, %r4, %r353, %r9;
mad.lo.s32 %r50, %r354, %r3, %r5;
mul.wide.u32 %rd83, %r50, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r355, %r4;
mov.u32 %r356, 31;
sub.s32 %r51, %r356, %r355;
mov.u32 %r357, 1;
shl.b32 %r614, %r357, %r51;
setp.lt.u32 %p40, %r9, %r614;
add.s32 %r358, %r614, %r9;
setp.lt.u32 %p41, %r358, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r359, %r3, %r51;
add.s32 %r360, %r50, %r359;
mul.wide.s32 %rd85, %r360, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r614, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r584, %r614;
$L__BB0_48:
shr.u32 %r54, %r584, 1;
setp.ge.u32 %p44, %r9, %r54;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r361, %r54, %r3, %r50;
mul.wide.s32 %rd88, %r361, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r584, 7;
mov.u32 %r584, %r54;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r585, 0;
add.s32 %r363, %r50, %r3;
mul.wide.u32 %rd91, %r363, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r585, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r364, %r3, %r51;
add.s32 %r365, %r50, %r364;
mul.wide.s32 %rd93, %r365, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r586, %r614;
$L__BB0_59:
shr.u32 %r58, %r586, 1;
setp.ge.u32 %p50, %r9, %r58;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r366, %r58, %r3, %r50;
mul.wide.s32 %rd96, %r366, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r586, 7;
mov.u32 %r586, %r58;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r587, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r368, %r3, %r51;
add.s32 %r369, %r50, %r368;
mul.wide.s32 %rd99, %r369, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r588, %r614;
$L__BB0_70:
shr.u32 %r62, %r588, 1;
setp.ge.u32 %p56, %r9, %r62;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r370, %r62, %r3, %r50;
mul.wide.s32 %rd102, %r370, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r588, 7;
mov.u32 %r588, %r62;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r589, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r372, %r3, %r51;
add.s32 %r373, %r50, %r372;
mul.wide.s32 %rd105, %r373, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r590, %r614;
$L__BB0_81:
shr.u32 %r66, %r590, 1;
setp.ge.u32 %p62, %r9, %r66;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r374, %r66, %r3, %r50;
mul.wide.s32 %rd108, %r374, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r590, 7;
mov.u32 %r590, %r66;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r591, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r376, %r3, %r51;
add.s32 %r377, %r50, %r376;
mul.wide.s32 %rd111, %r377, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r592, %r614;
$L__BB0_92:
shr.u32 %r70, %r592, 1;
setp.ge.u32 %p68, %r9, %r70;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r378, %r70, %r3, %r50;
mul.wide.s32 %rd114, %r378, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r592, 7;
mov.u32 %r592, %r70;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r593, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r380, %r3, %r51;
add.s32 %r381, %r50, %r380;
mul.wide.s32 %rd117, %r381, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r594, %r614;
$L__BB0_103:
shr.u32 %r74, %r594, 1;
setp.ge.u32 %p74, %r9, %r74;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r382, %r74, %r3, %r50;
mul.wide.s32 %rd120, %r382, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r594, 7;
mov.u32 %r594, %r74;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r595, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r384, %r3, %r51;
add.s32 %r385, %r50, %r384;
mul.wide.s32 %rd123, %r385, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r596, %r614;
$L__BB0_114:
shr.u32 %r78, %r596, 1;
setp.ge.u32 %p80, %r9, %r78;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r386, %r78, %r3, %r50;
mul.wide.s32 %rd126, %r386, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r596, 7;
mov.u32 %r596, %r78;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r597, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r388, %r3, %r51;
add.s32 %r389, %r50, %r388;
mul.wide.s32 %rd129, %r389, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r598, %r614;
$L__BB0_125:
shr.u32 %r82, %r598, 1;
setp.ge.u32 %p86, %r9, %r82;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r390, %r82, %r3, %r50;
mul.wide.s32 %rd132, %r390, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r598, 7;
mov.u32 %r598, %r82;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r599, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r583, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r392, %r3, %r51;
add.s32 %r393, %r50, %r392;
mul.wide.s32 %rd135, %r393, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r600, %r614;
$L__BB0_136:
shr.u32 %r87, %r600, 1;
setp.ge.u32 %p92, %r9, %r87;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r394, %r87, %r3, %r50;
mul.wide.s32 %rd138, %r394, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r600, 7;
mov.u32 %r600, %r87;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r601, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r396, %r3, %r51;
add.s32 %r397, %r50, %r396;
mul.wide.s32 %rd141, %r397, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r602, %r614;
$L__BB0_147:
shr.u32 %r91, %r602, 1;
setp.ge.u32 %p98, %r9, %r91;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r398, %r91, %r3, %r50;
mul.wide.s32 %rd144, %r398, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r602, 7;
mov.u32 %r602, %r91;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r603, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r400, %r3, %r51;
add.s32 %r401, %r50, %r400;
mul.wide.s32 %rd147, %r401, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r604, %r614;
$L__BB0_158:
shr.u32 %r95, %r604, 1;
setp.ge.u32 %p104, %r9, %r95;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r402, %r95, %r3, %r50;
mul.wide.s32 %rd150, %r402, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r604, 7;
mov.u32 %r604, %r95;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r605, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r404, %r3, %r51;
add.s32 %r405, %r50, %r404;
mul.wide.s32 %rd153, %r405, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r606, %r614;
$L__BB0_169:
shr.u32 %r99, %r606, 1;
setp.ge.u32 %p110, %r9, %r99;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r406, %r99, %r3, %r50;
mul.wide.s32 %rd156, %r406, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r606, 7;
mov.u32 %r606, %r99;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r607, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r408, %r3, %r51;
add.s32 %r409, %r50, %r408;
mul.wide.s32 %rd159, %r409, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r608, %r614;
$L__BB0_180:
shr.u32 %r103, %r608, 1;
setp.ge.u32 %p116, %r9, %r103;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r410, %r103, %r3, %r50;
mul.wide.s32 %rd162, %r410, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r608, 7;
mov.u32 %r608, %r103;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r609, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r412, %r3, %r51;
add.s32 %r413, %r50, %r412;
mul.wide.s32 %rd165, %r413, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r610, %r614;
$L__BB0_191:
shr.u32 %r107, %r610, 1;
setp.ge.u32 %p122, %r9, %r107;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r414, %r107, %r3, %r50;
mul.wide.s32 %rd168, %r414, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r610, 7;
mov.u32 %r610, %r107;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r611, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r611, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r416, %r3, %r51;
add.s32 %r417, %r50, %r416;
mul.wide.s32 %rd171, %r417, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r612, %r614;
$L__BB0_202:
shr.u32 %r111, %r612, 1;
setp.ge.u32 %p128, %r9, %r111;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r418, %r111, %r3, %r50;
mul.wide.s32 %rd174, %r418, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r612, 7;
mov.u32 %r612, %r111;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r613, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r613, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r420, %r3, %r51;
add.s32 %r421, %r50, %r420;
mul.wide.s32 %rd177, %r421, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r614, 1;
setp.ge.u32 %p134, %r9, %r115;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r422, %r115, %r3, %r50;
mul.wide.s32 %rd180, %r422, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r614, 7;
mov.u32 %r614, %r115;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r615, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r615, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
shl.b32 %r573, %r5, 3;
mov.u32 %r448, %ctaid.y;
mad.lo.s32 %r449, %r202, %r448, %r573;
add.s32 %r450, %r449, %r85;
mul.wide.s32 %rd189, %r450, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
// end inline asm
add.s32 %r451, %r450, 4;
mul.wide.s32 %rd190, %r451, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r424, %r570, 3;
sub.s32 %r118, %r424, %r202;
mov.u32 %r425, %ctaid.y;
mad.lo.s32 %r119, %r202, %r425, %r570;
neg.s32 %r426, %r85;
setp.ge.s32 %p141, %r118, %r426;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r431, %r119, %r85;
mul.wide.s32 %rd184, %r431, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
// end inline asm
$L__BB0_222:
mov.u32 %r432, -4;
sub.s32 %r433, %r432, %r85;
setp.ge.s32 %p143, %r118, %r433;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r438, %r119, %r85;
add.s32 %r439, %r438, 4;
mul.wide.s32 %rd186, %r439, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
// end inline asm
$L__BB0_226:
shl.b32 %r120, %r583, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
shl.b32 %r572, %r5, 3;
mov.u32 %r476, %ctaid.y;
mad.lo.s32 %r477, %r202, %r476, %r572;
add.s32 %r478, %r477, %r120;
mul.wide.s32 %rd197, %r478, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
// end inline asm
add.s32 %r479, %r478, 4;
mul.wide.s32 %rd198, %r479, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r452, %r571, 3;
sub.s32 %r121, %r452, %r202;
mov.u32 %r453, %ctaid.y;
mad.lo.s32 %r122, %r202, %r453, %r571;
neg.s32 %r454, %r120;
setp.ge.s32 %p150, %r121, %r454;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r459, %r122, %r120;
mul.wide.s32 %rd192, %r459, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
// end inline asm
$L__BB0_229:
mov.u32 %r460, -4;
sub.s32 %r461, %r460, %r120;
setp.ge.s32 %p152, %r121, %r461;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r466, %r122, %r120;
add.s32 %r467, %r466, 4;
mul.wide.s32 %rd194, %r467, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
// end inline asm
$L__BB0_233:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r480, %r5, %r9;
or.b32 %r482, %r480, %r353;
setp.ne.s32 %p156, %r482, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r483, %ctaid.x;
mov.u32 %r484, %ctaid.z;
mov.u32 %r485, %nctaid.x;
mad.lo.s32 %r486, %r484, %r485, %r483;
mul.wide.s32 %rd200, %r486, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r487, %r11, -1;
setp.eq.s32 %p157, %r123, %r487;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r616, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r616;
// end inline asm
setp.lt.u32 %p159, %r616, 256;
selp.u32 %r490, 1, 0, %p159;
shl.b32 %r616, %r616, %r490;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r491, %r11, %r3;
add.s32 %r492, %r491, -1;
div.s32 %r126, %r492, %r3;
setp.lt.s32 %p161, %r126, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r494, %r202, 1;
shr.u32 %r495, %r494, 31;
add.s32 %r496, %r494, %r495;
shr.s32 %r497, %r496, 1;
add.s32 %r498, %r4, %r497;
add.s32 %r499, %r498, -1;
shl.b32 %r500, %r9, 1;
shl.b32 %r501, %r4, 1;
mad.lo.s32 %r502, %r501, %r123, %r500;
or.b32 %r503, %r502, 1;
setp.ge.s32 %p162, %r503, %r202;
div.s32 %r504, %r499, %r4;
setp.ge.s32 %p163, %r123, %r504;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r505, %r4, %r123;
shl.b32 %r506, %r505, 1;
mad.lo.s32 %r507, %r202, %r5, %r506;
add.s32 %r618, %r507, %r500;
mul.lo.s32 %r128, %r202, %r3;
mov.u32 %r493, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r617, %r5;
mov.u32 %r619, %r493;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r617, %r11;
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r618, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r621;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r620;
add.f32 %f678, %f678, %f559;
add.s32 %r618, %r618, %r128;
add.s32 %r617, %r617, %r3;
add.s32 %r619, %r619, 1;
setp.lt.s32 %p165, %r619, %r126;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
mov.u32 %r517, 1;
shl.b32 %r139, %r517, %r516;
setp.lt.u32 %p166, %r5, %r139;
add.s32 %r518, %r139, %r5;
setp.lt.u32 %p167, %r518, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r519, %r50, %r139;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r520, %r139, 31;
add.s32 %r521, %r139, %r520;
shr.s32 %r630, %r521, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r139, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r622, %r630;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r622;
@%p170 bra $L__BB0_249;
add.s32 %r522, %r622, %r50;
mul.wide.s32 %rd213, %r522, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r142, %r622, 1;
setp.gt.u32 %p171, %r622, 3;
mov.u32 %r622, %r142;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r523, %r50, 1;
mul.wide.u32 %rd216, %r523, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r623, %r630;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r623;
@%p176 bra $L__BB0_259;
add.s32 %r524, %r623, %r50;
mul.wide.s32 %rd218, %r524, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r144, %r623, 1;
setp.gt.u32 %p177, %r623, 3;
mov.u32 %r623, %r144;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r525, %r202, 1;
shr.u32 %r526, %r525, 31;
add.s32 %r527, %r525, %r526;
shr.s32 %r528, %r527, 1;
add.s32 %r529, %r4, %r528;
add.s32 %r530, %r529, -1;
div.s32 %r531, %r530, %r4;
setp.ge.s32 %p181, %r123, %r531;
@%p181 bra $L__BB0_267;
shl.b32 %r145, %r9, 1;
mul.lo.s32 %r532, %r4, %r123;
shl.b32 %r146, %r532, 1;
add.s32 %r533, %r145, %r146;
or.b32 %r534, %r533, 1;
setp.ge.s32 %p182, %r534, %r202;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r535, %r146, %r145;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r537, %r202, 1;
shr.u32 %r538, %r537, 31;
add.s32 %r539, %r537, %r538;
shr.s32 %r540, %r539, 1;
add.s32 %r541, %r4, %r540;
add.s32 %r542, %r541, -1;
shl.b32 %r543, %r9, 1;
shl.b32 %r544, %r4, 1;
mad.lo.s32 %r545, %r544, %r123, %r543;
or.b32 %r546, %r545, 1;
setp.ge.s32 %p184, %r546, %r202;
div.s32 %r547, %r542, %r4;
setp.ge.s32 %p185, %r123, %r547;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r548, %r4, %r123;
shl.b32 %r549, %r548, 1;
mad.lo.s32 %r550, %r202, %r5, %r549;
add.s32 %r625, %r550, %r543;
mul.lo.s32 %r148, %r202, %r3;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r624, %r5;
mov.u32 %r626, %r536;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r624, %r11;
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r625, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r628;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r627;
add.f32 %f684, %f684, %f585;
add.s32 %r625, %r625, %r148;
add.s32 %r624, %r624, %r3;
add.s32 %r626, %r626, 1;
setp.lt.s32 %p187, %r626, %r126;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r629, %r630;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r629;
@%p190 bra $L__BB0_279;
add.s32 %r557, %r629, %r50;
mul.wide.s32 %rd226, %r557, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r160, %r629, 1;
setp.gt.u32 %p191, %r629, 3;
mov.u32 %r629, %r160;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r630;
@%p196 bra $L__BB0_288;
add.s32 %r558, %r630, %r50;
mul.wide.s32 %rd229, %r558, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r162, %r630, 1;
setp.gt.u32 %p197, %r630, 3;
mov.u32 %r630, %r162;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r559, %r202, 1;
shr.u32 %r560, %r559, 31;
add.s32 %r561, %r559, %r560;
shr.s32 %r562, %r561, 1;
add.s32 %r563, %r4, %r562;
add.s32 %r564, %r563, -1;
div.s32 %r565, %r564, %r4;
setp.ge.s32 %p201, %r123, %r565;
@%p201 bra $L__BB0_296;
shl.b32 %r163, %r9, 1;
mul.lo.s32 %r566, %r4, %r123;
shl.b32 %r164, %r566, 1;
add.s32 %r567, %r163, %r164;
or.b32 %r568, %r567, 1;
setp.ge.s32 %p202, %r568, %r202;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_20c09547_1033910nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r569, %r164, %r163;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r569, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r200, %r201}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r210, %r211}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r214, %r215}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r236, %r201, 7;
shr.s32 %r237, %r236, 31;
shr.u32 %r238, %r237, 29;
add.s32 %r239, %r236, %r238;
shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r240, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r242, %r4, %r2;
shl.b32 %r243, %r242, 4;
or.b32 %r244, %r243, 15;
and.b32 %r7, %r244, -16;
add.s32 %r245, %r244, %r7;
and.b32 %r246, %r245, -16;
cvt.s64.s32 %rd1, %r246;
shl.b32 %r247, %r4, 2;
max.s32 %r248, %r2, %r3;
mad.lo.s32 %r249, %r247, %r248, 15;
and.b32 %r250, %r249, -16;
cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r251, %r8, 7;
setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
// end inline asm
shl.b32 %r255, %r5, 4;
add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r254, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r254, 0;
cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r577, %r6, 4;
add.s32 %r256, %r4, 215;
div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r258, %r11, %r257;
add.s32 %r259, %r258, -1;
div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r261, %ctaid.y;
mul.lo.s32 %r262, %r12, %r4;
mul.lo.s32 %r13, %r262, %r261;
mad.lo.s32 %r263, %r2, %r9, %r5;
shl.b32 %r14, %r263, 4;
mul.lo.s32 %r264, %r201, %r9;
cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r265, %r13, %r201;
cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r266, %tid.z;
mad.lo.s32 %r267, %r4, %r266, %r9;
mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r268, %r3;
mov.u32 %r269, 31;
sub.s32 %r270, %r269, %r268;
mov.u32 %r271, 1;
shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r272, %r16, %r5;
setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r273, %r15, %r16;
mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r274, %r16, 31;
add.s32 %r275, %r16, %r274;
shr.s32 %r17, %r275, 1;
shl.b32 %r276, %r9, 3;
mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r281, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r574, %r4;
add.s32 %r279, %r22, %r9;
add.s32 %r23, %r279, %r13;
setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r572, %r574, %r4;
mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r571, %r574, %r4;
add.s32 %r570, %r571, %r9;
add.s32 %r569, %r570, %r13;
setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r575, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r575, 1;
setp.gt.u32 %p27, %r575, 3;
mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r576, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r576, 1;
setp.gt.u32 %p33, %r576, 3;
mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
mad.lo.s32 %r351, %r23, %r201, %r8;
mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r574, %r574, 1;
setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r352, %tid.z;
mad.lo.s32 %r353, %r4, %r352, %r9;
mad.lo.s32 %r49, %r353, %r3, %r5;
mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r354, %r4;
mov.u32 %r355, 31;
sub.s32 %r50, %r355, %r354;
mov.u32 %r356, 1;
shl.b32 %r608, %r356, %r50;
setp.lt.u32 %p40, %r9, %r608;
add.s32 %r357, %r608, %r9;
setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r358, %r3, %r50;
add.s32 %r359, %r49, %r358;
mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r578, %r608;
$L__BB0_48:
shr.u32 %r53, %r578, 1;
setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r360, %r53, %r3, %r49;
mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r578, 7;
mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r579, 0;
add.s32 %r362, %r49, %r3;
mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r363, %r3, %r50;
add.s32 %r364, %r49, %r363;
mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r580, %r608;
$L__BB0_59:
shr.u32 %r57, %r580, 1;
setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r365, %r57, %r3, %r49;
mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r580, 7;
mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r367, %r3, %r50;
add.s32 %r368, %r49, %r367;
mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r582, %r608;
$L__BB0_70:
shr.u32 %r61, %r582, 1;
setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r369, %r61, %r3, %r49;
mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r582, 7;
mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r371, %r3, %r50;
add.s32 %r372, %r49, %r371;
mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r584, %r608;
$L__BB0_81:
shr.u32 %r65, %r584, 1;
setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r373, %r65, %r3, %r49;
mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r584, 7;
mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r375, %r3, %r50;
add.s32 %r376, %r49, %r375;
mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r586, %r608;
$L__BB0_92:
shr.u32 %r69, %r586, 1;
setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r377, %r69, %r3, %r49;
mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r586, 7;
mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r379, %r3, %r50;
add.s32 %r380, %r49, %r379;
mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r588, %r608;
$L__BB0_103:
shr.u32 %r73, %r588, 1;
setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r381, %r73, %r3, %r49;
mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r588, 7;
mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r383, %r3, %r50;
add.s32 %r384, %r49, %r383;
mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r590, %r608;
$L__BB0_114:
shr.u32 %r77, %r590, 1;
setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r385, %r77, %r3, %r49;
mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r590, 7;
mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r387, %r3, %r50;
add.s32 %r388, %r49, %r387;
mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r592, %r608;
$L__BB0_125:
shr.u32 %r81, %r592, 1;
setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r389, %r81, %r3, %r49;
mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r592, 7;
mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r391, %r3, %r50;
add.s32 %r392, %r49, %r391;
mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r594, %r608;
$L__BB0_136:
shr.u32 %r86, %r594, 1;
setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r393, %r86, %r3, %r49;
mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r594, 7;
mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r395, %r3, %r50;
add.s32 %r396, %r49, %r395;
mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r596, %r608;
$L__BB0_147:
shr.u32 %r90, %r596, 1;
setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r397, %r90, %r3, %r49;
mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r596, 7;
mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r399, %r3, %r50;
add.s32 %r400, %r49, %r399;
mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r598, %r608;
$L__BB0_158:
shr.u32 %r94, %r598, 1;
setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r401, %r94, %r3, %r49;
mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r598, 7;
mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r403, %r3, %r50;
add.s32 %r404, %r49, %r403;
mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r600, %r608;
$L__BB0_169:
shr.u32 %r98, %r600, 1;
setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r405, %r98, %r3, %r49;
mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r600, 7;
mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r407, %r3, %r50;
add.s32 %r408, %r49, %r407;
mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r602, %r608;
$L__BB0_180:
shr.u32 %r102, %r602, 1;
setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r409, %r102, %r3, %r49;
mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r602, 7;
mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r411, %r3, %r50;
add.s32 %r412, %r49, %r411;
mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r604, %r608;
$L__BB0_191:
shr.u32 %r106, %r604, 1;
setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r413, %r106, %r3, %r49;
mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r604, 7;
mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r415, %r3, %r50;
add.s32 %r416, %r49, %r415;
mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r606, %r608;
$L__BB0_202:
shr.u32 %r110, %r606, 1;
setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r417, %r110, %r3, %r49;
mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r606, 7;
mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r419, %r3, %r50;
add.s32 %r420, %r49, %r419;
mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r608, 1;
setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r421, %r114, %r3, %r49;
mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r608, 7;
mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
mov.u32 %r447, %ctaid.y;
mad.lo.s32 %r448, %r201, %r447, %r8;
add.s32 %r449, %r448, %r84;
mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
// end inline asm
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r423, %r8, 3;
sub.s32 %r117, %r423, %r201;
mov.u32 %r424, %ctaid.y;
mad.lo.s32 %r118, %r201, %r424, %r8;
neg.s32 %r425, %r84;
setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r430, %r118, %r84;
mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_222:
mov.u32 %r431, -4;
sub.s32 %r432, %r431, %r84;
setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r437, %r118, %r84;
add.s32 %r438, %r437, 4;
mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
// end inline asm
$L__BB0_226:
shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
mov.u32 %r475, %ctaid.y;
mad.lo.s32 %r476, %r201, %r475, %r8;
add.s32 %r477, %r476, %r119;
mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
// end inline asm
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r451, %r8, 3;
sub.s32 %r120, %r451, %r201;
mov.u32 %r452, %ctaid.y;
mad.lo.s32 %r121, %r201, %r452, %r8;
neg.s32 %r453, %r119;
setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r458, %r121, %r119;
mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
// end inline asm
$L__BB0_229:
mov.u32 %r459, -4;
sub.s32 %r460, %r459, %r119;
setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r465, %r121, %r119;
add.s32 %r466, %r465, 4;
mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
// end inline asm
$L__BB0_233:
mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r479, %r5, %r9;
or.b32 %r481, %r479, %r352;
setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r482, %ctaid.x;
mov.u32 %r483, %ctaid.z;
mov.u32 %r484, %nctaid.x;
mad.lo.s32 %r485, %r483, %r484, %r482;
mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r486, %r11, -1;
setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r610, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r610;
// end inline asm
setp.lt.u32 %p159, %r610, 256;
selp.u32 %r489, 1, 0, %p159;
shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r490, %r11, %r3;
add.s32 %r491, %r490, -1;
div.s32 %r125, %r491, %r3;
setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r493, %r201, 1;
shr.u32 %r494, %r493, 31;
add.s32 %r495, %r493, %r494;
shr.s32 %r496, %r495, 1;
add.s32 %r497, %r4, %r496;
add.s32 %r498, %r497, -1;
shl.b32 %r499, %r9, 1;
shl.b32 %r500, %r4, 1;
mad.lo.s32 %r501, %r500, %r122, %r499;
or.b32 %r502, %r501, 1;
setp.ge.s32 %p162, %r502, %r201;
div.s32 %r503, %r498, %r4;
setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r504, %r4, %r122;
shl.b32 %r505, %r504, 1;
mad.lo.s32 %r506, %r201, %r5, %r505;
add.s32 %r612, %r506, %r499;
mul.lo.s32 %r127, %r201, %r3;
mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r611, %r5;
mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r611, %r11;
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
add.s32 %r612, %r612, %r127;
add.s32 %r611, %r611, %r3;
add.s32 %r613, %r613, 1;
setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r513, %r3;
mov.u32 %r514, 31;
sub.s32 %r515, %r514, %r513;
mov.u32 %r516, 1;
shl.b32 %r138, %r516, %r515;
setp.lt.u32 %p166, %r5, %r138;
add.s32 %r517, %r138, %r5;
setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r518, %r49, %r138;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r519, %r138, 31;
add.s32 %r520, %r138, %r519;
shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r616, %r624;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
add.s32 %r521, %r616, %r49;
mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r141, %r616, 1;
setp.gt.u32 %p171, %r616, 3;
mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r522, %r49, 1;
mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r617, %r624;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
add.s32 %r523, %r617, %r49;
mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r143, %r617, 1;
setp.gt.u32 %p177, %r617, 3;
mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r524, %r201, 1;
shr.u32 %r525, %r524, 31;
add.s32 %r526, %r524, %r525;
shr.s32 %r527, %r526, 1;
add.s32 %r528, %r4, %r527;
add.s32 %r529, %r528, -1;
div.s32 %r530, %r529, %r4;
setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
shl.b32 %r144, %r9, 1;
mul.lo.s32 %r531, %r4, %r122;
shl.b32 %r145, %r531, 1;
add.s32 %r532, %r144, %r145;
or.b32 %r533, %r532, 1;
setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r536, %r201, 1;
shr.u32 %r537, %r536, 31;
add.s32 %r538, %r536, %r537;
shr.s32 %r539, %r538, 1;
add.s32 %r540, %r4, %r539;
add.s32 %r541, %r540, -1;
shl.b32 %r542, %r9, 1;
shl.b32 %r543, %r4, 1;
mad.lo.s32 %r544, %r543, %r122, %r542;
or.b32 %r545, %r544, 1;
setp.ge.s32 %p184, %r545, %r201;
div.s32 %r546, %r541, %r4;
setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r547, %r4, %r122;
shl.b32 %r548, %r547, 1;
mad.lo.s32 %r549, %r201, %r5, %r548;
add.s32 %r619, %r549, %r542;
mul.lo.s32 %r147, %r201, %r3;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r618, %r5;
mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r618, %r11;
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
add.s32 %r619, %r619, %r147;
add.s32 %r618, %r618, %r3;
add.s32 %r620, %r620, 1;
setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r623, %r624;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
add.s32 %r556, %r623, %r49;
mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r159, %r623, 1;
setp.gt.u32 %p191, %r623, 3;
mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
add.s32 %r557, %r624, %r49;
mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r161, %r624, 1;
setp.gt.u32 %p197, %r624, 3;
mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r558, %r201, 1;
shr.u32 %r559, %r558, 31;
add.s32 %r560, %r558, %r559;
shr.s32 %r561, %r560, 1;
add.s32 %r562, %r4, %r561;
add.s32 %r563, %r562, -1;
div.s32 %r564, %r563, %r4;
setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
shl.b32 %r162, %r9, 1;
mul.lo.s32 %r565, %r4, %r122;
shl.b32 %r163, %r565, 1;
add.s32 %r566, %r162, %r163;
or.b32 %r567, %r566, 1;
setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_41_cu_d846e7fb_723310nvfuser_41ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<631>;
+ .reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r200, %r201}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r210, %r211}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r214, %r215}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r237, %r202, 7;
- shr.s32 %r238, %r237, 31;
- shr.u32 %r239, %r238, 29;
- add.s32 %r240, %r237, %r239;
- shr.s32 %r2, %r240, 3;
+ add.s32 %r236, %r201, 7;
+ shr.s32 %r237, %r236, 31;
+ shr.u32 %r238, %r237, 29;
+ add.s32 %r239, %r236, %r238;
+ shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
- mov.u32 %r241, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
+ mov.u32 %r240, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r242, [%rd42], %r5;
+ atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r243, %r4, %r2;
- shl.b32 %r244, %r243, 4;
- or.b32 %r245, %r244, 15;
- and.b32 %r7, %r245, -16;
- add.s32 %r246, %r245, %r7;
- and.b32 %r247, %r246, -16;
- cvt.s64.s32 %rd1, %r247;
- shl.b32 %r248, %r4, 2;
- max.s32 %r249, %r2, %r3;
- mad.lo.s32 %r250, %r248, %r249, 15;
- and.b32 %r251, %r250, -16;
- cvt.u64.u32 %rd2, %r251;
+ mul.lo.s32 %r242, %r4, %r2;
+ shl.b32 %r243, %r242, 4;
+ or.b32 %r244, %r243, 15;
+ and.b32 %r7, %r244, -16;
+ add.s32 %r245, %r244, %r7;
+ and.b32 %r246, %r245, -16;
+ cvt.s64.s32 %rd1, %r246;
+ shl.b32 %r247, %r4, 2;
+ max.s32 %r248, %r2, %r3;
+ mad.lo.s32 %r249, %r247, %r248, 15;
+ and.b32 %r250, %r249, -16;
+ cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r252, %r8, 7;
- setp.lt.s32 %p11, %r252, %r202;
+ or.b32 %r251, %r8, 7;
+ setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
-
-
- shl.b32 %r256, %r5, 4;
- add.s32 %r254, %r253, %r256;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
+
+
+ shl.b32 %r255, %r5, 4;
+ add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
- mov.u32 %r255, 0;
+ mov.u32 %r254, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r255, 0;
- cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r254, 0;
+ cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r583, %r6, 4;
- add.s32 %r257, %r4, 215;
- div.s32 %r258, %r257, %r4;
+ shl.b32 %r577, %r6, 4;
+ add.s32 %r256, %r4, 215;
+ div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r259, %r11, %r258;
- add.s32 %r260, %r259, -1;
- div.s32 %r12, %r260, %r11;
+ add.s32 %r258, %r11, %r257;
+ add.s32 %r259, %r258, -1;
+ div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r202;
+ cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
- mov.u32 %r262, %ctaid.y;
- mul.lo.s32 %r263, %r12, %r4;
- mul.lo.s32 %r13, %r263, %r262;
- shl.b32 %r264, %r9, 1;
- mov.u32 %r265, 1;
- shl.b32 %r266, %r5, 4;
- mad.lo.s32 %r14, %r264, %r202, %r266;
- mul.lo.s32 %r267, %r202, %r9;
- cvt.s64.s32 %rd52, %r267;
+ mov.u32 %r261, %ctaid.y;
+ mul.lo.s32 %r262, %r12, %r4;
+ mul.lo.s32 %r13, %r262, %r261;
+ mad.lo.s32 %r263, %r2, %r9, %r5;
+ shl.b32 %r14, %r263, 4;
+ mul.lo.s32 %r264, %r201, %r9;
+ cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r268, %r13, %r202;
- cvt.s64.s32 %rd6, %r268;
+ mul.lo.s32 %r265, %r13, %r201;
+ cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r269, %tid.z;
- mad.lo.s32 %r270, %r4, %r269, %r9;
- mad.lo.s32 %r15, %r270, %r3, %r5;
+ mov.u32 %r266, %tid.z;
+ mad.lo.s32 %r267, %r4, %r266, %r9;
+ mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
- clz.b32 %r271, %r3;
- mov.u32 %r272, 31;
- sub.s32 %r273, %r272, %r271;
- shl.b32 %r16, %r265, %r273;
+ clz.b32 %r268, %r3;
+ mov.u32 %r269, 31;
+ sub.s32 %r270, %r269, %r268;
+ mov.u32 %r271, 1;
+ shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
- add.s32 %r274, %r16, %r5;
- setp.lt.u32 %p18, %r274, %r3;
+ add.s32 %r272, %r16, %r5;
+ setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
- add.s32 %r275, %r15, %r16;
- mul.wide.s32 %rd55, %r275, 4;
+ add.s32 %r273, %r15, %r16;
+ mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
- shr.u32 %r276, %r16, 31;
- add.s32 %r277, %r16, %r276;
- shr.s32 %r17, %r277, 1;
- add.s32 %r18, %r267, %r8;
+ shr.u32 %r274, %r16, 31;
+ add.s32 %r275, %r16, %r274;
+ shr.s32 %r17, %r275, 1;
+ shl.b32 %r276, %r9, 3;
+ mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
- mul.wide.s32 %rd57, %r18, 2;
+ mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
- mul.wide.s32 %rd61, %r270, 4;
+ mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
- add.s32 %r282, %r14, %r281;
+ add.s32 %r282, %r281, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
- add.s32 %r285, %r14, %r284;
+ add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r580, %r4;
- add.s32 %r279, %r23, %r9;
- add.s32 %r24, %r279, %r13;
- setp.gt.s32 %p19, %r24, 215;
+ mul.lo.s32 %r22, %r574, %r4;
+ add.s32 %r279, %r22, %r9;
+ add.s32 %r23, %r279, %r13;
+ setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
- mul.lo.s32 %r280, %r24, %r211;
+ mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p20, %r24, 216;
+ setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
- mul.lo.s32 %r578, %r580, %r4;
- mul.lo.s32 %r287, %r578, %r202;
+ mul.lo.s32 %r572, %r574, %r4;
+ mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r576, %r580, %r4;
- add.s32 %r575, %r576, %r9;
- add.s32 %r574, %r575, %r13;
- setp.gt.s32 %p204, %r574, 215;
+ mul.lo.s32 %r571, %r574, %r4;
+ add.s32 %r570, %r571, %r9;
+ add.s32 %r569, %r570, %r13;
+ setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
- mul.lo.s32 %r288, %r24, %r215;
+ mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r583, %r583, 2;
+ shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
- mov.u32 %r581, %r17;
+ mov.u32 %r575, %r17;
$L__BB0_20:
- setp.ge.u32 %p26, %r5, %r581;
+ setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
- add.s32 %r317, %r581, %r15;
+ add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r581, 1;
- setp.gt.u32 %p27, %r581, 3;
- mov.u32 %r581, %r36;
+ shr.u32 %r35, %r575, 1;
+ setp.gt.u32 %p27, %r575, 3;
+ mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
- mov.u32 %r582, %r17;
+ mov.u32 %r576, %r17;
$L__BB0_30:
- setp.ge.u32 %p32, %r5, %r582;
+ setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
- add.s32 %r318, %r582, %r15;
+ add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r582, 1;
- setp.gt.u32 %p33, %r582, 3;
- mov.u32 %r582, %r38;
+ shr.u32 %r37, %r576, 1;
+ setp.gt.u32 %p33, %r576, 3;
+ mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
@@ -801,13 +800,12 @@
{ cvt.rn.bf16.f32 %rs124, %f365;}
mov.b32 %r326, {%rs124, %rs128};
- add.s32 %r351, %r13, %r577;
- mad.lo.s32 %r352, %r351, %r202, %r18;
- mul.wide.s32 %rd82, %r352, 2;
+ mad.lo.s32 %r351, %r23, %r201, %r8;
+ mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.bf16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r580, %r580, 1;
- setp.lt.s32 %p39, %r580, %r12;
+ add.s32 %r574, %r574, 1;
+ setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r353, %tid.z;
- mad.lo.s32 %r354, %r4, %r353, %r9;
- mad.lo.s32 %r50, %r354, %r3, %r5;
- mul.wide.u32 %rd83, %r50, 4;
+ mov.u32 %r352, %tid.z;
+ mad.lo.s32 %r353, %r4, %r352, %r9;
+ mad.lo.s32 %r49, %r353, %r3, %r5;
+ mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r355, %r4;
- mov.u32 %r356, 31;
- sub.s32 %r51, %r356, %r355;
- mov.u32 %r357, 1;
- shl.b32 %r614, %r357, %r51;
- setp.lt.u32 %p40, %r9, %r614;
- add.s32 %r358, %r614, %r9;
- setp.lt.u32 %p41, %r358, %r4;
+ clz.b32 %r354, %r4;
+ mov.u32 %r355, 31;
+ sub.s32 %r50, %r355, %r354;
+ mov.u32 %r356, 1;
+ shl.b32 %r608, %r356, %r50;
+ setp.lt.u32 %p40, %r9, %r608;
+ add.s32 %r357, %r608, %r9;
+ setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
- shl.b32 %r359, %r3, %r51;
- add.s32 %r360, %r50, %r359;
- mul.wide.s32 %rd85, %r360, 4;
+ shl.b32 %r358, %r3, %r50;
+ add.s32 %r359, %r49, %r358;
+ mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p43, %r614, 4;
+ setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
- mov.u32 %r584, %r614;
+ mov.u32 %r578, %r608;
$L__BB0_48:
- shr.u32 %r54, %r584, 1;
- setp.ge.u32 %p44, %r9, %r54;
+ shr.u32 %r53, %r578, 1;
+ setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
- mad.lo.s32 %r361, %r54, %r3, %r50;
- mul.wide.s32 %rd88, %r361, 4;
+ mad.lo.s32 %r360, %r53, %r3, %r49;
+ mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p45, %r584, 7;
- mov.u32 %r584, %r54;
+ setp.gt.u32 %p45, %r578, 7;
+ mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r585, 0;
- add.s32 %r363, %r50, %r3;
- mul.wide.u32 %rd91, %r363, 4;
+ mov.u32 %r579, 0;
+ add.s32 %r362, %r49, %r3;
+ mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r585, %f660;
+ mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
- shl.b32 %r364, %r3, %r51;
- add.s32 %r365, %r50, %r364;
- mul.wide.s32 %rd93, %r365, 4;
+ shl.b32 %r363, %r3, %r50;
+ add.s32 %r364, %r49, %r363;
+ mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
- mov.u32 %r586, %r614;
+ mov.u32 %r580, %r608;
$L__BB0_59:
- shr.u32 %r58, %r586, 1;
- setp.ge.u32 %p50, %r9, %r58;
+ shr.u32 %r57, %r580, 1;
+ setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
- mad.lo.s32 %r366, %r58, %r3, %r50;
- mul.wide.s32 %rd96, %r366, 4;
+ mad.lo.s32 %r365, %r57, %r3, %r49;
+ mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p51, %r586, 7;
- mov.u32 %r586, %r58;
+ setp.gt.u32 %p51, %r580, 7;
+ mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r587, 0;
+ mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r587, %f661;
+ mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
- shl.b32 %r368, %r3, %r51;
- add.s32 %r369, %r50, %r368;
- mul.wide.s32 %rd99, %r369, 4;
+ shl.b32 %r367, %r3, %r50;
+ add.s32 %r368, %r49, %r367;
+ mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
- mov.u32 %r588, %r614;
+ mov.u32 %r582, %r608;
$L__BB0_70:
- shr.u32 %r62, %r588, 1;
- setp.ge.u32 %p56, %r9, %r62;
+ shr.u32 %r61, %r582, 1;
+ setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
- mad.lo.s32 %r370, %r62, %r3, %r50;
- mul.wide.s32 %rd102, %r370, 4;
+ mad.lo.s32 %r369, %r61, %r3, %r49;
+ mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p57, %r588, 7;
- mov.u32 %r588, %r62;
+ setp.gt.u32 %p57, %r582, 7;
+ mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r589, 0;
+ mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r589, %f662;
+ mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
- shl.b32 %r372, %r3, %r51;
- add.s32 %r373, %r50, %r372;
- mul.wide.s32 %rd105, %r373, 4;
+ shl.b32 %r371, %r3, %r50;
+ add.s32 %r372, %r49, %r371;
+ mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
- mov.u32 %r590, %r614;
+ mov.u32 %r584, %r608;
$L__BB0_81:
- shr.u32 %r66, %r590, 1;
- setp.ge.u32 %p62, %r9, %r66;
+ shr.u32 %r65, %r584, 1;
+ setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
- mad.lo.s32 %r374, %r66, %r3, %r50;
- mul.wide.s32 %rd108, %r374, 4;
+ mad.lo.s32 %r373, %r65, %r3, %r49;
+ mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p63, %r590, 7;
- mov.u32 %r590, %r66;
+ setp.gt.u32 %p63, %r584, 7;
+ mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r591, 0;
+ mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r591, %f663;
+ mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
- shl.b32 %r376, %r3, %r51;
- add.s32 %r377, %r50, %r376;
- mul.wide.s32 %rd111, %r377, 4;
+ shl.b32 %r375, %r3, %r50;
+ add.s32 %r376, %r49, %r375;
+ mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
- mov.u32 %r592, %r614;
+ mov.u32 %r586, %r608;
$L__BB0_92:
- shr.u32 %r70, %r592, 1;
- setp.ge.u32 %p68, %r9, %r70;
+ shr.u32 %r69, %r586, 1;
+ setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
- mad.lo.s32 %r378, %r70, %r3, %r50;
- mul.wide.s32 %rd114, %r378, 4;
+ mad.lo.s32 %r377, %r69, %r3, %r49;
+ mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p69, %r592, 7;
- mov.u32 %r592, %r70;
+ setp.gt.u32 %p69, %r586, 7;
+ mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r593, 0;
+ mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r593, %f664;
+ mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
- shl.b32 %r380, %r3, %r51;
- add.s32 %r381, %r50, %r380;
- mul.wide.s32 %rd117, %r381, 4;
+ shl.b32 %r379, %r3, %r50;
+ add.s32 %r380, %r49, %r379;
+ mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
- mov.u32 %r594, %r614;
+ mov.u32 %r588, %r608;
$L__BB0_103:
- shr.u32 %r74, %r594, 1;
- setp.ge.u32 %p74, %r9, %r74;
+ shr.u32 %r73, %r588, 1;
+ setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
- mad.lo.s32 %r382, %r74, %r3, %r50;
- mul.wide.s32 %rd120, %r382, 4;
+ mad.lo.s32 %r381, %r73, %r3, %r49;
+ mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p75, %r594, 7;
- mov.u32 %r594, %r74;
+ setp.gt.u32 %p75, %r588, 7;
+ mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r595, 0;
+ mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r595, %f665;
+ mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
- shl.b32 %r384, %r3, %r51;
- add.s32 %r385, %r50, %r384;
- mul.wide.s32 %rd123, %r385, 4;
+ shl.b32 %r383, %r3, %r50;
+ add.s32 %r384, %r49, %r383;
+ mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
- mov.u32 %r596, %r614;
+ mov.u32 %r590, %r608;
$L__BB0_114:
- shr.u32 %r78, %r596, 1;
- setp.ge.u32 %p80, %r9, %r78;
+ shr.u32 %r77, %r590, 1;
+ setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
- mad.lo.s32 %r386, %r78, %r3, %r50;
- mul.wide.s32 %rd126, %r386, 4;
+ mad.lo.s32 %r385, %r77, %r3, %r49;
+ mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p81, %r596, 7;
- mov.u32 %r596, %r78;
+ setp.gt.u32 %p81, %r590, 7;
+ mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r597, 0;
+ mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r597, %f666;
+ mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
- shl.b32 %r388, %r3, %r51;
- add.s32 %r389, %r50, %r388;
- mul.wide.s32 %rd129, %r389, 4;
+ shl.b32 %r387, %r3, %r50;
+ add.s32 %r388, %r49, %r387;
+ mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
- mov.u32 %r598, %r614;
+ mov.u32 %r592, %r608;
$L__BB0_125:
- shr.u32 %r82, %r598, 1;
- setp.ge.u32 %p86, %r9, %r82;
+ shr.u32 %r81, %r592, 1;
+ setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
- mad.lo.s32 %r390, %r82, %r3, %r50;
- mul.wide.s32 %rd132, %r390, 4;
+ mad.lo.s32 %r389, %r81, %r3, %r49;
+ mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p87, %r598, 7;
- mov.u32 %r598, %r82;
+ setp.gt.u32 %p87, %r592, 7;
+ mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r599, 0;
+ mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r599, %f667;
+ mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r583, 4;
+ shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
- shl.b32 %r392, %r3, %r51;
- add.s32 %r393, %r50, %r392;
- mul.wide.s32 %rd135, %r393, 4;
+ shl.b32 %r391, %r3, %r50;
+ add.s32 %r392, %r49, %r391;
+ mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
- mov.u32 %r600, %r614;
+ mov.u32 %r594, %r608;
$L__BB0_136:
- shr.u32 %r87, %r600, 1;
- setp.ge.u32 %p92, %r9, %r87;
+ shr.u32 %r86, %r594, 1;
+ setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
- mad.lo.s32 %r394, %r87, %r3, %r50;
- mul.wide.s32 %rd138, %r394, 4;
+ mad.lo.s32 %r393, %r86, %r3, %r49;
+ mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p93, %r600, 7;
- mov.u32 %r600, %r87;
+ setp.gt.u32 %p93, %r594, 7;
+ mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r601, 0;
+ mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r601, %f668;
+ mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
- shl.b32 %r396, %r3, %r51;
- add.s32 %r397, %r50, %r396;
- mul.wide.s32 %rd141, %r397, 4;
+ shl.b32 %r395, %r3, %r50;
+ add.s32 %r396, %r49, %r395;
+ mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
- mov.u32 %r602, %r614;
+ mov.u32 %r596, %r608;
$L__BB0_147:
- shr.u32 %r91, %r602, 1;
- setp.ge.u32 %p98, %r9, %r91;
+ shr.u32 %r90, %r596, 1;
+ setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
- mad.lo.s32 %r398, %r91, %r3, %r50;
- mul.wide.s32 %rd144, %r398, 4;
+ mad.lo.s32 %r397, %r90, %r3, %r49;
+ mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p99, %r602, 7;
- mov.u32 %r602, %r91;
+ setp.gt.u32 %p99, %r596, 7;
+ mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r603, 0;
+ mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r603, %f669;
+ mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
- shl.b32 %r400, %r3, %r51;
- add.s32 %r401, %r50, %r400;
- mul.wide.s32 %rd147, %r401, 4;
+ shl.b32 %r399, %r3, %r50;
+ add.s32 %r400, %r49, %r399;
+ mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
- mov.u32 %r604, %r614;
+ mov.u32 %r598, %r608;
$L__BB0_158:
- shr.u32 %r95, %r604, 1;
- setp.ge.u32 %p104, %r9, %r95;
+ shr.u32 %r94, %r598, 1;
+ setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
- mad.lo.s32 %r402, %r95, %r3, %r50;
- mul.wide.s32 %rd150, %r402, 4;
+ mad.lo.s32 %r401, %r94, %r3, %r49;
+ mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p105, %r604, 7;
- mov.u32 %r604, %r95;
+ setp.gt.u32 %p105, %r598, 7;
+ mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r605, 0;
+ mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r605, %f670;
+ mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
- shl.b32 %r404, %r3, %r51;
- add.s32 %r405, %r50, %r404;
- mul.wide.s32 %rd153, %r405, 4;
+ shl.b32 %r403, %r3, %r50;
+ add.s32 %r404, %r49, %r403;
+ mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
- mov.u32 %r606, %r614;
+ mov.u32 %r600, %r608;
$L__BB0_169:
- shr.u32 %r99, %r606, 1;
- setp.ge.u32 %p110, %r9, %r99;
+ shr.u32 %r98, %r600, 1;
+ setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
- mad.lo.s32 %r406, %r99, %r3, %r50;
- mul.wide.s32 %rd156, %r406, 4;
+ mad.lo.s32 %r405, %r98, %r3, %r49;
+ mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p111, %r606, 7;
- mov.u32 %r606, %r99;
+ setp.gt.u32 %p111, %r600, 7;
+ mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r607, 0;
+ mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r607, %f671;
+ mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
- shl.b32 %r408, %r3, %r51;
- add.s32 %r409, %r50, %r408;
- mul.wide.s32 %rd159, %r409, 4;
+ shl.b32 %r407, %r3, %r50;
+ add.s32 %r408, %r49, %r407;
+ mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
- mov.u32 %r608, %r614;
+ mov.u32 %r602, %r608;
$L__BB0_180:
- shr.u32 %r103, %r608, 1;
- setp.ge.u32 %p116, %r9, %r103;
+ shr.u32 %r102, %r602, 1;
+ setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
- mad.lo.s32 %r410, %r103, %r3, %r50;
- mul.wide.s32 %rd162, %r410, 4;
+ mad.lo.s32 %r409, %r102, %r3, %r49;
+ mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p117, %r608, 7;
- mov.u32 %r608, %r103;
+ setp.gt.u32 %p117, %r602, 7;
+ mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r609, 0;
+ mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r609, %f672;
+ mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
- shl.b32 %r412, %r3, %r51;
- add.s32 %r413, %r50, %r412;
- mul.wide.s32 %rd165, %r413, 4;
+ shl.b32 %r411, %r3, %r50;
+ add.s32 %r412, %r49, %r411;
+ mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
- mov.u32 %r610, %r614;
+ mov.u32 %r604, %r608;
$L__BB0_191:
- shr.u32 %r107, %r610, 1;
- setp.ge.u32 %p122, %r9, %r107;
+ shr.u32 %r106, %r604, 1;
+ setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
- mad.lo.s32 %r414, %r107, %r3, %r50;
- mul.wide.s32 %rd168, %r414, 4;
+ mad.lo.s32 %r413, %r106, %r3, %r49;
+ mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p123, %r610, 7;
- mov.u32 %r610, %r107;
+ setp.gt.u32 %p123, %r604, 7;
+ mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r611, 0;
+ mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r611, %f673;
+ mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
- shl.b32 %r416, %r3, %r51;
- add.s32 %r417, %r50, %r416;
- mul.wide.s32 %rd171, %r417, 4;
+ shl.b32 %r415, %r3, %r50;
+ add.s32 %r416, %r49, %r415;
+ mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
- mov.u32 %r612, %r614;
+ mov.u32 %r606, %r608;
$L__BB0_202:
- shr.u32 %r111, %r612, 1;
- setp.ge.u32 %p128, %r9, %r111;
+ shr.u32 %r110, %r606, 1;
+ setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
- mad.lo.s32 %r418, %r111, %r3, %r50;
- mul.wide.s32 %rd174, %r418, 4;
+ mad.lo.s32 %r417, %r110, %r3, %r49;
+ mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p129, %r612, 7;
- mov.u32 %r612, %r111;
+ setp.gt.u32 %p129, %r606, 7;
+ mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r613, 0;
+ mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r613, %f674;
+ mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
- shl.b32 %r420, %r3, %r51;
- add.s32 %r421, %r50, %r420;
- mul.wide.s32 %rd177, %r421, 4;
+ shl.b32 %r419, %r3, %r50;
+ add.s32 %r420, %r49, %r419;
+ mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r614, 1;
- setp.ge.u32 %p134, %r9, %r115;
+ shr.u32 %r114, %r608, 1;
+ setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
- mad.lo.s32 %r422, %r115, %r3, %r50;
- mul.wide.s32 %rd180, %r422, 4;
+ mad.lo.s32 %r421, %r114, %r3, %r49;
+ mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p135, %r614, 7;
- mov.u32 %r614, %r115;
+ setp.gt.u32 %p135, %r608, 7;
+ mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r615, 0;
+ mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@@ -1735,255 +1733,251 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r615, %f675;
+ mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
- shl.b32 %r573, %r5, 3;
- mov.u32 %r448, %ctaid.y;
- mad.lo.s32 %r449, %r202, %r448, %r573;
- add.s32 %r450, %r449, %r85;
- mul.wide.s32 %rd189, %r450, 4;
+ mov.u32 %r447, %ctaid.y;
+ mad.lo.s32 %r448, %r201, %r447, %r8;
+ add.s32 %r449, %r448, %r84;
+ mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
- st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
-
- add.s32 %r451, %r450, 4;
- mul.wide.s32 %rd190, %r451, 4;
+ st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
+
+ add.s32 %r450, %r449, 4;
+ mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
- st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
- add.s32 %r424, %r570, 3;
- sub.s32 %r118, %r424, %r202;
- mov.u32 %r425, %ctaid.y;
- mad.lo.s32 %r119, %r202, %r425, %r570;
- neg.s32 %r426, %r85;
- setp.ge.s32 %p141, %r118, %r426;
+ add.s32 %r423, %r8, 3;
+ sub.s32 %r117, %r423, %r201;
+ mov.u32 %r424, %ctaid.y;
+ mad.lo.s32 %r118, %r201, %r424, %r8;
+ neg.s32 %r425, %r84;
+ setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
- add.s32 %r431, %r119, %r85;
- mul.wide.s32 %rd184, %r431, 4;
+ add.s32 %r430, %r118, %r84;
+ mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
- st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
+ st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
$L__BB0_222:
- mov.u32 %r432, -4;
- sub.s32 %r433, %r432, %r85;
- setp.ge.s32 %p143, %r118, %r433;
+ mov.u32 %r431, -4;
+ sub.s32 %r432, %r431, %r84;
+ setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
- add.s32 %r438, %r119, %r85;
- add.s32 %r439, %r438, 4;
- mul.wide.s32 %rd186, %r439, 4;
+ add.s32 %r437, %r118, %r84;
+ add.s32 %r438, %r437, 4;
+ mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
$L__BB0_226:
- shl.b32 %r120, %r583, 5;
+ shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
- shl.b32 %r572, %r5, 3;
- mov.u32 %r476, %ctaid.y;
- mad.lo.s32 %r477, %r202, %r476, %r572;
- add.s32 %r478, %r477, %r120;
- mul.wide.s32 %rd197, %r478, 4;
+ mov.u32 %r475, %ctaid.y;
+ mad.lo.s32 %r476, %r201, %r475, %r8;
+ add.s32 %r477, %r476, %r119;
+ mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
- st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
-
- add.s32 %r479, %r478, 4;
- mul.wide.s32 %rd198, %r479, 4;
+ st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
+
+ add.s32 %r478, %r477, 4;
+ mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
- st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
bra.uni $L__BB0_233;
$L__BB0_227:
- shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
- add.s32 %r452, %r571, 3;
- sub.s32 %r121, %r452, %r202;
- mov.u32 %r453, %ctaid.y;
- mad.lo.s32 %r122, %r202, %r453, %r571;
- neg.s32 %r454, %r120;
- setp.ge.s32 %p150, %r121, %r454;
+ add.s32 %r451, %r8, 3;
+ sub.s32 %r120, %r451, %r201;
+ mov.u32 %r452, %ctaid.y;
+ mad.lo.s32 %r121, %r201, %r452, %r8;
+ neg.s32 %r453, %r119;
+ setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
- add.s32 %r459, %r122, %r120;
- mul.wide.s32 %rd192, %r459, 4;
+ add.s32 %r458, %r121, %r119;
+ mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
- st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
+ st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
$L__BB0_229:
- mov.u32 %r460, -4;
- sub.s32 %r461, %r460, %r120;
- setp.ge.s32 %p152, %r121, %r461;
+ mov.u32 %r459, -4;
+ sub.s32 %r460, %r459, %r119;
+ setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
- add.s32 %r466, %r122, %r120;
- add.s32 %r467, %r466, 4;
- mul.wide.s32 %rd194, %r467, 4;
+ add.s32 %r465, %r121, %r119;
+ add.s32 %r466, %r465, 4;
+ mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
$L__BB0_233:
- mov.u32 %r123, %ctaid.y;
+ mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r480, %r5, %r9;
- or.b32 %r482, %r480, %r353;
- setp.ne.s32 %p156, %r482, 0;
+ or.b32 %r479, %r5, %r9;
+ or.b32 %r481, %r479, %r352;
+ setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
- mov.u32 %r483, %ctaid.x;
- mov.u32 %r484, %ctaid.z;
- mov.u32 %r485, %nctaid.x;
- mad.lo.s32 %r486, %r484, %r485, %r483;
- mul.wide.s32 %rd200, %r486, 8;
+ mov.u32 %r482, %ctaid.x;
+ mov.u32 %r483, %ctaid.z;
+ mov.u32 %r484, %nctaid.x;
+ mad.lo.s32 %r485, %r483, %r484, %r482;
+ mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
- add.s32 %r487, %r11, -1;
- setp.eq.s32 %p157, %r123, %r487;
+ add.s32 %r486, %r11, -1;
+ setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
- mov.u32 %r616, 8;
+ mov.u32 %r610, 8;
$L__BB0_236:
- nanosleep.u32 %r616;
-
- setp.lt.u32 %p159, %r616, 256;
- selp.u32 %r490, 1, 0, %p159;
- shl.b32 %r616, %r616, %r490;
+ nanosleep.u32 %r610;
+
+ setp.lt.u32 %p159, %r610, 256;
+ selp.u32 %r489, 1, 0, %p159;
+ shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
- add.s32 %r491, %r11, %r3;
- add.s32 %r492, %r491, -1;
- div.s32 %r126, %r492, %r3;
- setp.lt.s32 %p161, %r126, 1;
+ add.s32 %r490, %r11, %r3;
+ add.s32 %r491, %r490, -1;
+ div.s32 %r125, %r491, %r3;
+ setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
- add.s32 %r494, %r202, 1;
- shr.u32 %r495, %r494, 31;
- add.s32 %r496, %r494, %r495;
- shr.s32 %r497, %r496, 1;
- add.s32 %r498, %r4, %r497;
- add.s32 %r499, %r498, -1;
- shl.b32 %r500, %r9, 1;
- shl.b32 %r501, %r4, 1;
- mad.lo.s32 %r502, %r501, %r123, %r500;
- or.b32 %r503, %r502, 1;
- setp.ge.s32 %p162, %r503, %r202;
- div.s32 %r504, %r499, %r4;
- setp.ge.s32 %p163, %r123, %r504;
+ add.s32 %r493, %r201, 1;
+ shr.u32 %r494, %r493, 31;
+ add.s32 %r495, %r493, %r494;
+ shr.s32 %r496, %r495, 1;
+ add.s32 %r497, %r4, %r496;
+ add.s32 %r498, %r497, -1;
+ shl.b32 %r499, %r9, 1;
+ shl.b32 %r500, %r4, 1;
+ mad.lo.s32 %r501, %r500, %r122, %r499;
+ or.b32 %r502, %r501, 1;
+ setp.ge.s32 %p162, %r502, %r201;
+ div.s32 %r503, %r498, %r4;
+ setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
- mul.lo.s32 %r505, %r4, %r123;
- shl.b32 %r506, %r505, 1;
- mad.lo.s32 %r507, %r202, %r5, %r506;
- add.s32 %r618, %r507, %r500;
- mul.lo.s32 %r128, %r202, %r3;
- mov.u32 %r493, 0;
+ mul.lo.s32 %r504, %r4, %r122;
+ shl.b32 %r505, %r504, 1;
+ mad.lo.s32 %r506, %r201, %r5, %r505;
+ add.s32 %r612, %r506, %r499;
+ mul.lo.s32 %r127, %r201, %r3;
+ mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r617, %r5;
- mov.u32 %r619, %r493;
+ mov.u32 %r611, %r5;
+ mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
- setp.ge.s32 %p164, %r617, %r11;
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ setp.ge.s32 %p164, %r611, %r11;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
- mul.wide.s32 %rd210, %r618, 4;
+ mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
- ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
+ ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
$L__BB0_242:
- mov.b32 %f558, %r621;
+ mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r620;
+ mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
- add.s32 %r618, %r618, %r128;
- add.s32 %r617, %r617, %r3;
- add.s32 %r619, %r619, 1;
- setp.lt.s32 %p165, %r619, %r126;
+ add.s32 %r612, %r612, %r127;
+ add.s32 %r611, %r611, %r3;
+ add.s32 %r613, %r613, 1;
+ setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
- clz.b32 %r514, %r3;
- mov.u32 %r515, 31;
- sub.s32 %r516, %r515, %r514;
- mov.u32 %r517, 1;
- shl.b32 %r139, %r517, %r516;
- setp.lt.u32 %p166, %r5, %r139;
- add.s32 %r518, %r139, %r5;
- setp.lt.u32 %p167, %r518, %r3;
+ clz.b32 %r513, %r3;
+ mov.u32 %r514, 31;
+ sub.s32 %r515, %r514, %r513;
+ mov.u32 %r516, 1;
+ shl.b32 %r138, %r516, %r515;
+ setp.lt.u32 %p166, %r5, %r138;
+ add.s32 %r517, %r138, %r5;
+ setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
- add.s32 %r519, %r50, %r139;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r138;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
- shr.u32 %r520, %r139, 31;
- add.s32 %r521, %r139, %r520;
- shr.s32 %r630, %r521, 1;
+ shr.u32 %r519, %r138, 31;
+ add.s32 %r520, %r138, %r519;
+ shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
@@ -1991,38 +1985,38 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
- setp.lt.s32 %p169, %r139, 4;
+ setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
- mov.u32 %r622, %r630;
+ mov.u32 %r616, %r624;
$L__BB0_247:
- setp.ge.u32 %p170, %r5, %r622;
+ setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
- add.s32 %r522, %r622, %r50;
- mul.wide.s32 %rd213, %r522, 4;
+ add.s32 %r521, %r616, %r49;
+ mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
- shr.u32 %r142, %r622, 1;
- setp.gt.u32 %p171, %r622, 3;
- mov.u32 %r622, %r142;
+ shr.u32 %r141, %r616, 1;
+ setp.gt.u32 %p171, %r616, 3;
+ mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
- add.s32 %r523, %r50, 1;
- mul.wide.u32 %rd216, %r523, 4;
+ add.s32 %r522, %r49, 1;
+ mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
@@ -2050,29 +2044,29 @@
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
- mov.u32 %r623, %r630;
+ mov.u32 %r617, %r624;
$L__BB0_257:
- setp.ge.u32 %p176, %r5, %r623;
+ setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
- add.s32 %r524, %r623, %r50;
- mul.wide.s32 %rd218, %r524, 4;
+ add.s32 %r523, %r617, %r49;
+ mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
- shr.u32 %r144, %r623, 1;
- setp.gt.u32 %p177, %r623, 3;
- mov.u32 %r623, %r144;
+ shr.u32 %r143, %r617, 1;
+ setp.gt.u32 %p177, %r617, 3;
+ mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
@@ -2091,90 +2085,90 @@
{ cvt.rn.bf16.f32 %rs130, %f681;}
@%p10 bra $L__BB0_267;
- add.s32 %r525, %r202, 1;
- shr.u32 %r526, %r525, 31;
- add.s32 %r527, %r525, %r526;
- shr.s32 %r528, %r527, 1;
- add.s32 %r529, %r4, %r528;
- add.s32 %r530, %r529, -1;
- div.s32 %r531, %r530, %r4;
- setp.ge.s32 %p181, %r123, %r531;
+ add.s32 %r524, %r201, 1;
+ shr.u32 %r525, %r524, 31;
+ add.s32 %r526, %r524, %r525;
+ shr.s32 %r527, %r526, 1;
+ add.s32 %r528, %r4, %r527;
+ add.s32 %r529, %r528, -1;
+ div.s32 %r530, %r529, %r4;
+ setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
- shl.b32 %r145, %r9, 1;
- mul.lo.s32 %r532, %r4, %r123;
- shl.b32 %r146, %r532, 1;
- add.s32 %r533, %r145, %r146;
- or.b32 %r534, %r533, 1;
- setp.ge.s32 %p182, %r534, %r202;
+ shl.b32 %r144, %r9, 1;
+ mul.lo.s32 %r531, %r4, %r122;
+ shl.b32 %r145, %r531, 1;
+ add.s32 %r532, %r144, %r145;
+ or.b32 %r533, %r532, 1;
+ setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r535, %r146, %r145;
+ add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
- mul.wide.s32 %rd222, %r535, 2;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
- add.s32 %r537, %r202, 1;
- shr.u32 %r538, %r537, 31;
- add.s32 %r539, %r537, %r538;
- shr.s32 %r540, %r539, 1;
- add.s32 %r541, %r4, %r540;
- add.s32 %r542, %r541, -1;
- shl.b32 %r543, %r9, 1;
- shl.b32 %r544, %r4, 1;
- mad.lo.s32 %r545, %r544, %r123, %r543;
- or.b32 %r546, %r545, 1;
- setp.ge.s32 %p184, %r546, %r202;
- div.s32 %r547, %r542, %r4;
- setp.ge.s32 %p185, %r123, %r547;
+ add.s32 %r536, %r201, 1;
+ shr.u32 %r537, %r536, 31;
+ add.s32 %r538, %r536, %r537;
+ shr.s32 %r539, %r538, 1;
+ add.s32 %r540, %r4, %r539;
+ add.s32 %r541, %r540, -1;
+ shl.b32 %r542, %r9, 1;
+ shl.b32 %r543, %r4, 1;
+ mad.lo.s32 %r544, %r543, %r122, %r542;
+ or.b32 %r545, %r544, 1;
+ setp.ge.s32 %p184, %r545, %r201;
+ div.s32 %r546, %r541, %r4;
+ setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
- mul.lo.s32 %r548, %r4, %r123;
- shl.b32 %r549, %r548, 1;
- mad.lo.s32 %r550, %r202, %r5, %r549;
- add.s32 %r625, %r550, %r543;
- mul.lo.s32 %r148, %r202, %r3;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r547, %r4, %r122;
+ shl.b32 %r548, %r547, 1;
+ mad.lo.s32 %r549, %r201, %r5, %r548;
+ add.s32 %r619, %r549, %r542;
+ mul.lo.s32 %r147, %r201, %r3;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r624, %r5;
- mov.u32 %r626, %r536;
+ mov.u32 %r618, %r5;
+ mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
- setp.ge.s32 %p186, %r624, %r11;
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ setp.ge.s32 %p186, %r618, %r11;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
- mul.wide.s32 %rd225, %r625, 4;
+ mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
- ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
+ ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
$L__BB0_272:
- mov.b32 %f584, %r628;
+ mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r627;
+ mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
- add.s32 %r625, %r625, %r148;
- add.s32 %r624, %r624, %r3;
- add.s32 %r626, %r626, 1;
- setp.lt.s32 %p187, %r626, %r126;
+ add.s32 %r619, %r619, %r147;
+ add.s32 %r618, %r618, %r3;
+ add.s32 %r620, %r620, 1;
+ setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2187,29 +2181,29 @@
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
- mov.u32 %r629, %r630;
+ mov.u32 %r623, %r624;
$L__BB0_277:
- setp.ge.u32 %p190, %r5, %r629;
+ setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
- add.s32 %r557, %r629, %r50;
- mul.wide.s32 %rd226, %r557, 4;
+ add.s32 %r556, %r623, %r49;
+ mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
- shr.u32 %r160, %r629, 1;
- setp.gt.u32 %p191, %r629, 3;
- mov.u32 %r629, %r160;
+ shr.u32 %r159, %r623, 1;
+ setp.gt.u32 %p191, %r623, 3;
+ mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
@@ -2240,26 +2234,26 @@
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
- setp.ge.u32 %p196, %r5, %r630;
+ setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
- add.s32 %r558, %r630, %r50;
- mul.wide.s32 %rd229, %r558, 4;
+ add.s32 %r557, %r624, %r49;
+ mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
- shr.u32 %r162, %r630, 1;
- setp.gt.u32 %p197, %r630, 3;
- mov.u32 %r630, %r162;
+ shr.u32 %r161, %r624, 1;
+ setp.gt.u32 %p197, %r624, 3;
+ mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
@@ -2278,32 +2272,32 @@
{ cvt.rn.bf16.f32 %rs132, %f687;}
@%p10 bra $L__BB0_296;
- add.s32 %r559, %r202, 1;
- shr.u32 %r560, %r559, 31;
- add.s32 %r561, %r559, %r560;
- shr.s32 %r562, %r561, 1;
- add.s32 %r563, %r4, %r562;
- add.s32 %r564, %r563, -1;
- div.s32 %r565, %r564, %r4;
- setp.ge.s32 %p201, %r123, %r565;
+ add.s32 %r558, %r201, 1;
+ shr.u32 %r559, %r558, 31;
+ add.s32 %r560, %r558, %r559;
+ shr.s32 %r561, %r560, 1;
+ add.s32 %r562, %r4, %r561;
+ add.s32 %r563, %r562, -1;
+ div.s32 %r564, %r563, %r4;
+ setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
- shl.b32 %r163, %r9, 1;
- mul.lo.s32 %r566, %r4, %r123;
- shl.b32 %r164, %r566, 1;
- add.s32 %r567, %r163, %r164;
- or.b32 %r568, %r567, 1;
- setp.ge.s32 %p202, %r568, %r202;
+ shl.b32 %r162, %r9, 1;
+ mul.lo.s32 %r565, %r4, %r122;
+ shl.b32 %r163, %r565, 1;
+ add.s32 %r566, %r162, %r163;
+ or.b32 %r567, %r566, 1;
+ setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r569, %r164, %r163;
+ add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
- mul.wide.s32 %rd233, %r569, 2;
+ mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
20: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_96
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T59[i13]
= T59[i13]
+ T58[i13];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i15]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T46[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i18]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T47[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<631>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
shl.b32 %r248, %r4, 2;
max.s32 %r249, %r2, %r3;
mad.lo.s32 %r250, %r248, %r249, 15;
and.b32 %r251, %r250, -16;
cvt.u64.u32 %rd2, %r251;
mov.u64 %rd43, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_103395arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r252, %r8, 7;
setp.lt.s32 %p11, %r252, %r202;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
// end inline asm
shl.b32 %r256, %r5, 4;
add.s32 %r254, %r253, %r256;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r255, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r255, 0;
cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r583, %r6, 4;
add.s32 %r257, %r4, 215;
div.s32 %r258, %r257, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r259, %r11, %r258;
add.s32 %r260, %r259, -1;
div.s32 %r12, %r260, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r262, %ctaid.y;
mul.lo.s32 %r263, %r12, %r4;
mul.lo.s32 %r13, %r263, %r262;
shl.b32 %r264, %r9, 1;
mov.u32 %r265, 1;
shl.b32 %r266, %r5, 4;
mad.lo.s32 %r14, %r264, %r202, %r266;
mul.lo.s32 %r267, %r202, %r9;
cvt.s64.s32 %rd52, %r267;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r268, %r13, %r202;
cvt.s64.s32 %rd6, %r268;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r269, %tid.z;
mad.lo.s32 %r270, %r4, %r269, %r9;
mad.lo.s32 %r15, %r270, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r271, %r3;
mov.u32 %r272, 31;
sub.s32 %r273, %r272, %r271;
shl.b32 %r16, %r265, %r273;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r274, %r16, %r5;
setp.lt.u32 %p18, %r274, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r275, %r15, %r16;
mul.wide.s32 %rd55, %r275, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r276, %r16, 31;
add.s32 %r277, %r16, %r276;
shr.s32 %r17, %r277, 1;
add.s32 %r18, %r267, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r18, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r270, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r580, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r14, %r281;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r14, %r284;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r580, %r4;
add.s32 %r279, %r23, %r9;
add.s32 %r24, %r279, %r13;
setp.gt.s32 %p19, %r24, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r24, %r211;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r24, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r578, %r580, %r4;
mul.lo.s32 %r287, %r578, %r202;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r576, %r580, %r4;
add.s32 %r575, %r576, %r9;
add.s32 %r574, %r575, %r13;
setp.gt.s32 %p204, %r574, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r24, %r215;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r583, %r583, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r581, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r581;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r581, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r581, 1;
setp.gt.u32 %p27, %r581, 3;
mov.u32 %r581, %r36;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r582, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r582;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r582, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r582, 1;
setp.gt.u32 %p33, %r582, 3;
mov.u32 %r582, %r38;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
add.s32 %r351, %r13, %r577;
mad.lo.s32 %r352, %r351, %r202, %r18;
mul.wide.s32 %rd82, %r352, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r580, %r580, 1;
setp.lt.s32 %p39, %r580, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r353, %tid.z;
mad.lo.s32 %r354, %r4, %r353, %r9;
mad.lo.s32 %r50, %r354, %r3, %r5;
mul.wide.u32 %rd83, %r50, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r355, %r4;
mov.u32 %r356, 31;
sub.s32 %r51, %r356, %r355;
mov.u32 %r357, 1;
shl.b32 %r614, %r357, %r51;
setp.lt.u32 %p40, %r9, %r614;
add.s32 %r358, %r614, %r9;
setp.lt.u32 %p41, %r358, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r359, %r3, %r51;
add.s32 %r360, %r50, %r359;
mul.wide.s32 %rd85, %r360, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r614, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r584, %r614;
$L__BB0_48:
shr.u32 %r54, %r584, 1;
setp.ge.u32 %p44, %r9, %r54;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r361, %r54, %r3, %r50;
mul.wide.s32 %rd88, %r361, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r584, 7;
mov.u32 %r584, %r54;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r585, 0;
add.s32 %r363, %r50, %r3;
mul.wide.u32 %rd91, %r363, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r585, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r364, %r3, %r51;
add.s32 %r365, %r50, %r364;
mul.wide.s32 %rd93, %r365, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r586, %r614;
$L__BB0_59:
shr.u32 %r58, %r586, 1;
setp.ge.u32 %p50, %r9, %r58;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r366, %r58, %r3, %r50;
mul.wide.s32 %rd96, %r366, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r586, 7;
mov.u32 %r586, %r58;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r587, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r368, %r3, %r51;
add.s32 %r369, %r50, %r368;
mul.wide.s32 %rd99, %r369, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r588, %r614;
$L__BB0_70:
shr.u32 %r62, %r588, 1;
setp.ge.u32 %p56, %r9, %r62;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r370, %r62, %r3, %r50;
mul.wide.s32 %rd102, %r370, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r588, 7;
mov.u32 %r588, %r62;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r589, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r372, %r3, %r51;
add.s32 %r373, %r50, %r372;
mul.wide.s32 %rd105, %r373, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r590, %r614;
$L__BB0_81:
shr.u32 %r66, %r590, 1;
setp.ge.u32 %p62, %r9, %r66;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r374, %r66, %r3, %r50;
mul.wide.s32 %rd108, %r374, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r590, 7;
mov.u32 %r590, %r66;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r591, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r376, %r3, %r51;
add.s32 %r377, %r50, %r376;
mul.wide.s32 %rd111, %r377, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r592, %r614;
$L__BB0_92:
shr.u32 %r70, %r592, 1;
setp.ge.u32 %p68, %r9, %r70;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r378, %r70, %r3, %r50;
mul.wide.s32 %rd114, %r378, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r592, 7;
mov.u32 %r592, %r70;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r593, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r380, %r3, %r51;
add.s32 %r381, %r50, %r380;
mul.wide.s32 %rd117, %r381, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r594, %r614;
$L__BB0_103:
shr.u32 %r74, %r594, 1;
setp.ge.u32 %p74, %r9, %r74;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r382, %r74, %r3, %r50;
mul.wide.s32 %rd120, %r382, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r594, 7;
mov.u32 %r594, %r74;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r595, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r384, %r3, %r51;
add.s32 %r385, %r50, %r384;
mul.wide.s32 %rd123, %r385, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r596, %r614;
$L__BB0_114:
shr.u32 %r78, %r596, 1;
setp.ge.u32 %p80, %r9, %r78;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r386, %r78, %r3, %r50;
mul.wide.s32 %rd126, %r386, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r596, 7;
mov.u32 %r596, %r78;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r597, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r388, %r3, %r51;
add.s32 %r389, %r50, %r388;
mul.wide.s32 %rd129, %r389, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r598, %r614;
$L__BB0_125:
shr.u32 %r82, %r598, 1;
setp.ge.u32 %p86, %r9, %r82;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r390, %r82, %r3, %r50;
mul.wide.s32 %rd132, %r390, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r598, 7;
mov.u32 %r598, %r82;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r599, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r583, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r392, %r3, %r51;
add.s32 %r393, %r50, %r392;
mul.wide.s32 %rd135, %r393, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r600, %r614;
$L__BB0_136:
shr.u32 %r87, %r600, 1;
setp.ge.u32 %p92, %r9, %r87;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r394, %r87, %r3, %r50;
mul.wide.s32 %rd138, %r394, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r600, 7;
mov.u32 %r600, %r87;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r601, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r396, %r3, %r51;
add.s32 %r397, %r50, %r396;
mul.wide.s32 %rd141, %r397, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r602, %r614;
$L__BB0_147:
shr.u32 %r91, %r602, 1;
setp.ge.u32 %p98, %r9, %r91;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r398, %r91, %r3, %r50;
mul.wide.s32 %rd144, %r398, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r602, 7;
mov.u32 %r602, %r91;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r603, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r400, %r3, %r51;
add.s32 %r401, %r50, %r400;
mul.wide.s32 %rd147, %r401, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r604, %r614;
$L__BB0_158:
shr.u32 %r95, %r604, 1;
setp.ge.u32 %p104, %r9, %r95;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r402, %r95, %r3, %r50;
mul.wide.s32 %rd150, %r402, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r604, 7;
mov.u32 %r604, %r95;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r605, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r404, %r3, %r51;
add.s32 %r405, %r50, %r404;
mul.wide.s32 %rd153, %r405, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r606, %r614;
$L__BB0_169:
shr.u32 %r99, %r606, 1;
setp.ge.u32 %p110, %r9, %r99;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r406, %r99, %r3, %r50;
mul.wide.s32 %rd156, %r406, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r606, 7;
mov.u32 %r606, %r99;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r607, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r408, %r3, %r51;
add.s32 %r409, %r50, %r408;
mul.wide.s32 %rd159, %r409, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r608, %r614;
$L__BB0_180:
shr.u32 %r103, %r608, 1;
setp.ge.u32 %p116, %r9, %r103;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r410, %r103, %r3, %r50;
mul.wide.s32 %rd162, %r410, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r608, 7;
mov.u32 %r608, %r103;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r609, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r412, %r3, %r51;
add.s32 %r413, %r50, %r412;
mul.wide.s32 %rd165, %r413, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r610, %r614;
$L__BB0_191:
shr.u32 %r107, %r610, 1;
setp.ge.u32 %p122, %r9, %r107;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r414, %r107, %r3, %r50;
mul.wide.s32 %rd168, %r414, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r610, 7;
mov.u32 %r610, %r107;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r611, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r611, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r416, %r3, %r51;
add.s32 %r417, %r50, %r416;
mul.wide.s32 %rd171, %r417, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r612, %r614;
$L__BB0_202:
shr.u32 %r111, %r612, 1;
setp.ge.u32 %p128, %r9, %r111;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r418, %r111, %r3, %r50;
mul.wide.s32 %rd174, %r418, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r612, 7;
mov.u32 %r612, %r111;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r613, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r613, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r420, %r3, %r51;
add.s32 %r421, %r50, %r420;
mul.wide.s32 %rd177, %r421, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r614, 1;
setp.ge.u32 %p134, %r9, %r115;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r422, %r115, %r3, %r50;
mul.wide.s32 %rd180, %r422, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r614, 7;
mov.u32 %r614, %r115;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r615, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r615, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
shl.b32 %r573, %r5, 3;
mov.u32 %r448, %ctaid.y;
mad.lo.s32 %r449, %r202, %r448, %r573;
add.s32 %r450, %r449, %r85;
mul.wide.s32 %rd189, %r450, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
// end inline asm
add.s32 %r451, %r450, 4;
mul.wide.s32 %rd190, %r451, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r424, %r570, 3;
sub.s32 %r118, %r424, %r202;
mov.u32 %r425, %ctaid.y;
mad.lo.s32 %r119, %r202, %r425, %r570;
neg.s32 %r426, %r85;
setp.ge.s32 %p141, %r118, %r426;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r431, %r119, %r85;
mul.wide.s32 %rd184, %r431, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
// end inline asm
$L__BB0_222:
mov.u32 %r432, -4;
sub.s32 %r433, %r432, %r85;
setp.ge.s32 %p143, %r118, %r433;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r438, %r119, %r85;
add.s32 %r439, %r438, 4;
mul.wide.s32 %rd186, %r439, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
// end inline asm
$L__BB0_226:
shl.b32 %r120, %r583, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
shl.b32 %r572, %r5, 3;
mov.u32 %r476, %ctaid.y;
mad.lo.s32 %r477, %r202, %r476, %r572;
add.s32 %r478, %r477, %r120;
mul.wide.s32 %rd197, %r478, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
// end inline asm
add.s32 %r479, %r478, 4;
mul.wide.s32 %rd198, %r479, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r452, %r571, 3;
sub.s32 %r121, %r452, %r202;
mov.u32 %r453, %ctaid.y;
mad.lo.s32 %r122, %r202, %r453, %r571;
neg.s32 %r454, %r120;
setp.ge.s32 %p150, %r121, %r454;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r459, %r122, %r120;
mul.wide.s32 %rd192, %r459, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
// end inline asm
$L__BB0_229:
mov.u32 %r460, -4;
sub.s32 %r461, %r460, %r120;
setp.ge.s32 %p152, %r121, %r461;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r466, %r122, %r120;
add.s32 %r467, %r466, 4;
mul.wide.s32 %rd194, %r467, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
// end inline asm
$L__BB0_233:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r480, %r5, %r9;
or.b32 %r482, %r480, %r353;
setp.ne.s32 %p156, %r482, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r483, %ctaid.x;
mov.u32 %r484, %ctaid.z;
mov.u32 %r485, %nctaid.x;
mad.lo.s32 %r486, %r484, %r485, %r483;
mul.wide.s32 %rd200, %r486, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r487, %r11, -1;
setp.eq.s32 %p157, %r123, %r487;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r616, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r616;
// end inline asm
setp.lt.u32 %p159, %r616, 256;
selp.u32 %r490, 1, 0, %p159;
shl.b32 %r616, %r616, %r490;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r491, %r11, %r3;
add.s32 %r492, %r491, -1;
div.s32 %r126, %r492, %r3;
setp.lt.s32 %p161, %r126, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r494, %r202, 1;
shr.u32 %r495, %r494, 31;
add.s32 %r496, %r494, %r495;
shr.s32 %r497, %r496, 1;
add.s32 %r498, %r4, %r497;
add.s32 %r499, %r498, -1;
shl.b32 %r500, %r9, 1;
shl.b32 %r501, %r4, 1;
mad.lo.s32 %r502, %r501, %r123, %r500;
or.b32 %r503, %r502, 1;
setp.ge.s32 %p162, %r503, %r202;
div.s32 %r504, %r499, %r4;
setp.ge.s32 %p163, %r123, %r504;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r505, %r4, %r123;
shl.b32 %r506, %r505, 1;
mad.lo.s32 %r507, %r202, %r5, %r506;
add.s32 %r618, %r507, %r500;
mul.lo.s32 %r128, %r202, %r3;
mov.u32 %r493, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r617, %r5;
mov.u32 %r619, %r493;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r617, %r11;
mov.u32 %r620, %r493;
mov.u32 %r621, %r493;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r618, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r621;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r620;
add.f32 %f678, %f678, %f559;
add.s32 %r618, %r618, %r128;
add.s32 %r617, %r617, %r3;
add.s32 %r619, %r619, 1;
setp.lt.s32 %p165, %r619, %r126;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
mov.u32 %r517, 1;
shl.b32 %r139, %r517, %r516;
setp.lt.u32 %p166, %r5, %r139;
add.s32 %r518, %r139, %r5;
setp.lt.u32 %p167, %r518, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r519, %r50, %r139;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r520, %r139, 31;
add.s32 %r521, %r139, %r520;
shr.s32 %r630, %r521, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r139, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r622, %r630;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r622;
@%p170 bra $L__BB0_249;
add.s32 %r522, %r622, %r50;
mul.wide.s32 %rd213, %r522, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r142, %r622, 1;
setp.gt.u32 %p171, %r622, 3;
mov.u32 %r622, %r142;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r523, %r50, 1;
mul.wide.u32 %rd216, %r523, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r623, %r630;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r623;
@%p176 bra $L__BB0_259;
add.s32 %r524, %r623, %r50;
mul.wide.s32 %rd218, %r524, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r144, %r623, 1;
setp.gt.u32 %p177, %r623, 3;
mov.u32 %r623, %r144;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r525, %r202, 1;
shr.u32 %r526, %r525, 31;
add.s32 %r527, %r525, %r526;
shr.s32 %r528, %r527, 1;
add.s32 %r529, %r4, %r528;
add.s32 %r530, %r529, -1;
div.s32 %r531, %r530, %r4;
setp.ge.s32 %p181, %r123, %r531;
@%p181 bra $L__BB0_267;
shl.b32 %r145, %r9, 1;
mul.lo.s32 %r532, %r4, %r123;
shl.b32 %r146, %r532, 1;
add.s32 %r533, %r145, %r146;
or.b32 %r534, %r533, 1;
setp.ge.s32 %p182, %r534, %r202;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r535, %r146, %r145;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r537, %r202, 1;
shr.u32 %r538, %r537, 31;
add.s32 %r539, %r537, %r538;
shr.s32 %r540, %r539, 1;
add.s32 %r541, %r4, %r540;
add.s32 %r542, %r541, -1;
shl.b32 %r543, %r9, 1;
shl.b32 %r544, %r4, 1;
mad.lo.s32 %r545, %r544, %r123, %r543;
or.b32 %r546, %r545, 1;
setp.ge.s32 %p184, %r546, %r202;
div.s32 %r547, %r542, %r4;
setp.ge.s32 %p185, %r123, %r547;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r548, %r4, %r123;
shl.b32 %r549, %r548, 1;
mad.lo.s32 %r550, %r202, %r5, %r549;
add.s32 %r625, %r550, %r543;
mul.lo.s32 %r148, %r202, %r3;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r624, %r5;
mov.u32 %r626, %r536;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r624, %r11;
mov.u32 %r627, %r536;
mov.u32 %r628, %r536;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r625, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r628;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r627;
add.f32 %f684, %f684, %f585;
add.s32 %r625, %r625, %r148;
add.s32 %r624, %r624, %r3;
add.s32 %r626, %r626, 1;
setp.lt.s32 %p187, %r626, %r126;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r629, %r630;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r629;
@%p190 bra $L__BB0_279;
add.s32 %r557, %r629, %r50;
mul.wide.s32 %rd226, %r557, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r160, %r629, 1;
setp.gt.u32 %p191, %r629, 3;
mov.u32 %r629, %r160;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r630;
@%p196 bra $L__BB0_288;
add.s32 %r558, %r630, %r50;
mul.wide.s32 %rd229, %r558, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r162, %r630, 1;
setp.gt.u32 %p197, %r630, 3;
mov.u32 %r630, %r162;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r559, %r202, 1;
shr.u32 %r560, %r559, 31;
add.s32 %r561, %r559, %r560;
shr.s32 %r562, %r561, 1;
add.s32 %r563, %r4, %r562;
add.s32 %r564, %r563, -1;
div.s32 %r565, %r564, %r4;
setp.ge.s32 %p201, %r123, %r565;
@%p201 bra $L__BB0_296;
shl.b32 %r163, %r9, 1;
mul.lo.s32 %r566, %r4, %r123;
shl.b32 %r164, %r566, 1;
add.s32 %r567, %r163, %r164;
or.b32 %r568, %r567, 1;
setp.ge.s32 %p202, %r568, %r202;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_20c09547_1033910nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r569, %r164, %r163;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r569, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r200, %r201}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r210, %r211}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r214, %r215}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r236, %r201, 7;
shr.s32 %r237, %r236, 31;
shr.u32 %r238, %r237, 29;
add.s32 %r239, %r236, %r238;
shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
mov.u32 %r240, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r242, %r4, %r2;
shl.b32 %r243, %r242, 4;
or.b32 %r244, %r243, 15;
and.b32 %r7, %r244, -16;
add.s32 %r245, %r244, %r7;
and.b32 %r246, %r245, -16;
cvt.s64.s32 %rd1, %r246;
shl.b32 %r247, %r4, 2;
max.s32 %r248, %r2, %r3;
mad.lo.s32 %r249, %r247, %r248, 15;
and.b32 %r250, %r249, -16;
cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_72335arrayE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r251, %r8, 7;
setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
// end inline asm
shl.b32 %r255, %r5, 4;
add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
mov.u32 %r254, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r254, 0;
cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r577, %r6, 4;
add.s32 %r256, %r4, 215;
div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r258, %r11, %r257;
add.s32 %r259, %r258, -1;
div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
mov.u32 %r261, %ctaid.y;
mul.lo.s32 %r262, %r12, %r4;
mul.lo.s32 %r13, %r262, %r261;
mad.lo.s32 %r263, %r2, %r9, %r5;
shl.b32 %r14, %r263, 4;
mul.lo.s32 %r264, %r201, %r9;
cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
mul.lo.s32 %r265, %r13, %r201;
cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r266, %tid.z;
mad.lo.s32 %r267, %r4, %r266, %r9;
mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
clz.b32 %r268, %r3;
mov.u32 %r269, 31;
sub.s32 %r270, %r269, %r268;
mov.u32 %r271, 1;
shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
add.s32 %r272, %r16, %r5;
setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
add.s32 %r273, %r15, %r16;
mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
shr.u32 %r274, %r16, 31;
add.s32 %r275, %r16, %r274;
shr.s32 %r17, %r275, 1;
shl.b32 %r276, %r9, 3;
mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
// end inline asm
add.s32 %r282, %r281, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
// end inline asm
add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r574, %r4;
add.s32 %r279, %r22, %r9;
add.s32 %r23, %r279, %r13;
setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
mul.lo.s32 %r572, %r574, %r4;
mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
mov.u32 %r286, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r282], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd31, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r286, 0;
cp.async.ca.shared.global [%r285], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r571, %r574, %r4;
add.s32 %r570, %r571, %r9;
add.s32 %r569, %r570, %r13;
setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r293, %r294, %r295, %r296}, [%rd9];
ld.shared.v4.u32 {%r301, %r302, %r303, %r304}, [%rd10];
ld.shared.v4.u32 {%r309, %r310, %r311, %r312}, [%rd12];
mov.b32 {%rs36, %rs39}, %r309;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r301;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r293;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r310;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r302;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r294;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r311;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r303;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r295;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r312;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r304;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r296;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
mov.u32 %r575, %r17;
$L__BB0_20:
setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r575, 1;
setp.gt.u32 %p27, %r575, 3;
mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
setp.lt.u32 %p29, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p29 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p24 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
mov.u32 %r576, %r17;
$L__BB0_30:
setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r576, 1;
setp.gt.u32 %p33, %r576, 3;
mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
setp.lt.u32 %p35, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p35 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p10 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p10 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r343;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r327;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r323, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r336;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r344;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r328;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r324, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r337;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r345;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r329;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r325, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r338;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r346;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r330;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r326, {%rs124, %rs128};
mad.lo.s32 %r351, %r23, %r201, %r8;
mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
// begin inline asm
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r574, %r574, 1;
setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r352, %tid.z;
mad.lo.s32 %r353, %r4, %r352, %r9;
mad.lo.s32 %r49, %r353, %r3, %r5;
mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r354, %r4;
mov.u32 %r355, 31;
sub.s32 %r50, %r355, %r354;
mov.u32 %r356, 1;
shl.b32 %r608, %r356, %r50;
setp.lt.u32 %p40, %r9, %r608;
add.s32 %r357, %r608, %r9;
setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
shl.b32 %r358, %r3, %r50;
add.s32 %r359, %r49, %r358;
mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
mov.u32 %r578, %r608;
$L__BB0_48:
shr.u32 %r53, %r578, 1;
setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
mad.lo.s32 %r360, %r53, %r3, %r49;
mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p45, %r578, 7;
mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r579, 0;
add.s32 %r362, %r49, %r3;
mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p47, %r4, 2;
@%p47 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
shl.b32 %r363, %r3, %r50;
add.s32 %r364, %r49, %r363;
mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
mov.u32 %r580, %r608;
$L__BB0_59:
shr.u32 %r57, %r580, 1;
setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
mad.lo.s32 %r365, %r57, %r3, %r49;
mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p51, %r580, 7;
mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@%p53 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
shl.b32 %r367, %r3, %r50;
add.s32 %r368, %r49, %r367;
mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
mov.u32 %r582, %r608;
$L__BB0_70:
shr.u32 %r61, %r582, 1;
setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
mad.lo.s32 %r369, %r61, %r3, %r49;
mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p57, %r582, 7;
mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@%p59 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
shl.b32 %r371, %r3, %r50;
add.s32 %r372, %r49, %r371;
mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
mov.u32 %r584, %r608;
$L__BB0_81:
shr.u32 %r65, %r584, 1;
setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
mad.lo.s32 %r373, %r65, %r3, %r49;
mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p63, %r584, 7;
mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@%p65 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
shl.b32 %r375, %r3, %r50;
add.s32 %r376, %r49, %r375;
mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
mov.u32 %r586, %r608;
$L__BB0_92:
shr.u32 %r69, %r586, 1;
setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
mad.lo.s32 %r377, %r69, %r3, %r49;
mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p69, %r586, 7;
mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@%p71 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
shl.b32 %r379, %r3, %r50;
add.s32 %r380, %r49, %r379;
mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
mov.u32 %r588, %r608;
$L__BB0_103:
shr.u32 %r73, %r588, 1;
setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
mad.lo.s32 %r381, %r73, %r3, %r49;
mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p75, %r588, 7;
mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@%p77 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
shl.b32 %r383, %r3, %r50;
add.s32 %r384, %r49, %r383;
mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
mov.u32 %r590, %r608;
$L__BB0_114:
shr.u32 %r77, %r590, 1;
setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
mad.lo.s32 %r385, %r77, %r3, %r49;
mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p81, %r590, 7;
mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@%p83 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
shl.b32 %r387, %r3, %r50;
add.s32 %r388, %r49, %r387;
mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
mov.u32 %r592, %r608;
$L__BB0_125:
shr.u32 %r81, %r592, 1;
setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
mad.lo.s32 %r389, %r81, %r3, %r49;
mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p87, %r592, 7;
mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@%p89 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
shl.b32 %r391, %r3, %r50;
add.s32 %r392, %r49, %r391;
mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
mov.u32 %r594, %r608;
$L__BB0_136:
shr.u32 %r86, %r594, 1;
setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
mad.lo.s32 %r393, %r86, %r3, %r49;
mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p93, %r594, 7;
mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@%p95 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
shl.b32 %r395, %r3, %r50;
add.s32 %r396, %r49, %r395;
mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
mov.u32 %r596, %r608;
$L__BB0_147:
shr.u32 %r90, %r596, 1;
setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
mad.lo.s32 %r397, %r90, %r3, %r49;
mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p99, %r596, 7;
mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@%p101 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
shl.b32 %r399, %r3, %r50;
add.s32 %r400, %r49, %r399;
mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
mov.u32 %r598, %r608;
$L__BB0_158:
shr.u32 %r94, %r598, 1;
setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
mad.lo.s32 %r401, %r94, %r3, %r49;
mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p105, %r598, 7;
mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@%p107 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
shl.b32 %r403, %r3, %r50;
add.s32 %r404, %r49, %r403;
mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
mov.u32 %r600, %r608;
$L__BB0_169:
shr.u32 %r98, %r600, 1;
setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
mad.lo.s32 %r405, %r98, %r3, %r49;
mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p111, %r600, 7;
mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@%p113 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
shl.b32 %r407, %r3, %r50;
add.s32 %r408, %r49, %r407;
mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
mov.u32 %r602, %r608;
$L__BB0_180:
shr.u32 %r102, %r602, 1;
setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
mad.lo.s32 %r409, %r102, %r3, %r49;
mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p117, %r602, 7;
mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@%p119 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
shl.b32 %r411, %r3, %r50;
add.s32 %r412, %r49, %r411;
mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
mov.u32 %r604, %r608;
$L__BB0_191:
shr.u32 %r106, %r604, 1;
setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
mad.lo.s32 %r413, %r106, %r3, %r49;
mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p123, %r604, 7;
mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@%p125 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
shl.b32 %r415, %r3, %r50;
add.s32 %r416, %r49, %r415;
mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
mov.u32 %r606, %r608;
$L__BB0_202:
shr.u32 %r110, %r606, 1;
setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
mad.lo.s32 %r417, %r110, %r3, %r49;
mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p129, %r606, 7;
mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@%p131 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
shl.b32 %r419, %r3, %r50;
add.s32 %r420, %r49, %r419;
mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r608, 1;
setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
mad.lo.s32 %r421, %r114, %r3, %r49;
mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p135, %r608, 7;
mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@%p137 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
mov.u32 %r447, %ctaid.y;
mad.lo.s32 %r448, %r201, %r447, %r8;
add.s32 %r449, %r448, %r84;
mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
// end inline asm
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
// begin inline asm
st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
add.s32 %r423, %r8, 3;
sub.s32 %r117, %r423, %r201;
mov.u32 %r424, %ctaid.y;
mad.lo.s32 %r118, %r201, %r424, %r8;
neg.s32 %r425, %r84;
setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
add.s32 %r430, %r118, %r84;
mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
// end inline asm
$L__BB0_222:
mov.u32 %r431, -4;
sub.s32 %r432, %r431, %r84;
setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
add.s32 %r437, %r118, %r84;
add.s32 %r438, %r437, 4;
mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
// end inline asm
$L__BB0_226:
shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
mov.u32 %r475, %ctaid.y;
mad.lo.s32 %r476, %r201, %r475, %r8;
add.s32 %r477, %r476, %r119;
mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
// end inline asm
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
// begin inline asm
st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
// end inline asm
bra.uni $L__BB0_233;
$L__BB0_227:
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
add.s32 %r451, %r8, 3;
sub.s32 %r120, %r451, %r201;
mov.u32 %r452, %ctaid.y;
mad.lo.s32 %r121, %r201, %r452, %r8;
neg.s32 %r453, %r119;
setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
add.s32 %r458, %r121, %r119;
mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
// end inline asm
$L__BB0_229:
mov.u32 %r459, -4;
sub.s32 %r460, %r459, %r119;
setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
add.s32 %r465, %r121, %r119;
add.s32 %r466, %r465, 4;
mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
// end inline asm
$L__BB0_233:
mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r479, %r5, %r9;
or.b32 %r481, %r479, %r352;
setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
mov.u32 %r482, %ctaid.x;
mov.u32 %r483, %ctaid.z;
mov.u32 %r484, %nctaid.x;
mad.lo.s32 %r485, %r483, %r484, %r482;
mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
add.s32 %r486, %r11, -1;
setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
mov.u32 %r610, 8;
$L__BB0_236:
// begin inline asm
nanosleep.u32 %r610;
// end inline asm
setp.lt.u32 %p159, %r610, 256;
selp.u32 %r489, 1, 0, %p159;
shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
add.s32 %r490, %r11, %r3;
add.s32 %r491, %r490, -1;
div.s32 %r125, %r491, %r3;
setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
add.s32 %r493, %r201, 1;
shr.u32 %r494, %r493, 31;
add.s32 %r495, %r493, %r494;
shr.s32 %r496, %r495, 1;
add.s32 %r497, %r4, %r496;
add.s32 %r498, %r497, -1;
shl.b32 %r499, %r9, 1;
shl.b32 %r500, %r4, 1;
mad.lo.s32 %r501, %r500, %r122, %r499;
or.b32 %r502, %r501, 1;
setp.ge.s32 %p162, %r502, %r201;
div.s32 %r503, %r498, %r4;
setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
mul.lo.s32 %r504, %r4, %r122;
shl.b32 %r505, %r504, 1;
mad.lo.s32 %r506, %r201, %r5, %r505;
add.s32 %r612, %r506, %r499;
mul.lo.s32 %r127, %r201, %r3;
mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r611, %r5;
mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
setp.ge.s32 %p164, %r611, %r11;
mov.u32 %r614, %r492;
mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
// begin inline asm
ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
// end inline asm
$L__BB0_242:
mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
add.s32 %r612, %r612, %r127;
add.s32 %r611, %r611, %r3;
add.s32 %r613, %r613, 1;
setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
clz.b32 %r513, %r3;
mov.u32 %r514, 31;
sub.s32 %r515, %r514, %r513;
mov.u32 %r516, 1;
shl.b32 %r138, %r516, %r515;
setp.lt.u32 %p166, %r5, %r138;
add.s32 %r517, %r138, %r5;
setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
add.s32 %r518, %r49, %r138;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
shr.u32 %r519, %r138, 31;
add.s32 %r520, %r138, %r519;
shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
mov.u32 %r616, %r624;
$L__BB0_247:
setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
add.s32 %r521, %r616, %r49;
mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
shr.u32 %r141, %r616, 1;
setp.gt.u32 %p171, %r616, 3;
mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
add.s32 %r522, %r49, 1;
mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p173 bra $L__BB0_253;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_253:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p168 bra $L__BB0_255;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
mov.u32 %r617, %r624;
$L__BB0_257:
setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
add.s32 %r523, %r617, %r49;
mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
shr.u32 %r143, %r617, 1;
setp.gt.u32 %p177, %r617, 3;
mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
setp.lt.u32 %p179, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p179 bra $L__BB0_263;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_263:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p10 bra $L__BB0_267;
add.s32 %r524, %r201, 1;
shr.u32 %r525, %r524, 31;
add.s32 %r526, %r524, %r525;
shr.s32 %r527, %r526, 1;
add.s32 %r528, %r4, %r527;
add.s32 %r529, %r528, -1;
div.s32 %r530, %r529, %r4;
setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
shl.b32 %r144, %r9, 1;
mul.lo.s32 %r531, %r4, %r122;
shl.b32 %r145, %r531, 1;
add.s32 %r532, %r144, %r145;
or.b32 %r533, %r532, 1;
setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
add.s32 %r536, %r201, 1;
shr.u32 %r537, %r536, 31;
add.s32 %r538, %r536, %r537;
shr.s32 %r539, %r538, 1;
add.s32 %r540, %r4, %r539;
add.s32 %r541, %r540, -1;
shl.b32 %r542, %r9, 1;
shl.b32 %r543, %r4, 1;
mad.lo.s32 %r544, %r543, %r122, %r542;
or.b32 %r545, %r544, 1;
setp.ge.s32 %p184, %r545, %r201;
div.s32 %r546, %r541, %r4;
setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
mul.lo.s32 %r547, %r4, %r122;
shl.b32 %r548, %r547, 1;
mad.lo.s32 %r549, %r201, %r5, %r548;
add.s32 %r619, %r549, %r542;
mul.lo.s32 %r147, %r201, %r3;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r618, %r5;
mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
setp.ge.s32 %p186, %r618, %r11;
mov.u32 %r621, %r535;
mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
// end inline asm
$L__BB0_272:
mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
add.s32 %r619, %r619, %r147;
add.s32 %r618, %r618, %r3;
add.s32 %r620, %r620, 1;
setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p168 bra $L__BB0_275;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
mov.u32 %r623, %r624;
$L__BB0_277:
setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
add.s32 %r556, %r623, %r49;
mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
shr.u32 %r159, %r623, 1;
setp.gt.u32 %p191, %r623, 3;
mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
setp.lt.u32 %p193, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p193 bra $L__BB0_283;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_283:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p168 bra $L__BB0_285;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
add.s32 %r557, %r624, %r49;
mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
shr.u32 %r161, %r624, 1;
setp.gt.u32 %p197, %r624, 3;
mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
setp.lt.u32 %p199, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p199 bra $L__BB0_292;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_292:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p10 bra $L__BB0_296;
add.s32 %r558, %r201, 1;
shr.u32 %r559, %r558, 31;
add.s32 %r560, %r558, %r559;
shr.s32 %r561, %r560, 1;
add.s32 %r562, %r4, %r561;
add.s32 %r563, %r562, -1;
div.s32 %r564, %r563, %r4;
setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
shl.b32 %r162, %r9, 1;
mul.lo.s32 %r565, %r4, %r122;
shl.b32 %r163, %r565, 1;
add.s32 %r566, %r162, %r163;
or.b32 %r567, %r566, 1;
setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_42_cu_571ec8e8_723310nvfuser_42ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<205>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<631>;
+ .reg .b32 %r<625>;
.reg .f64 %fd<3>;
.reg .b64 %rd<238>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r200, %r201}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r210, %r211}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r214, %r215}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd39, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd31, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r237, %r202, 7;
- shr.s32 %r238, %r237, 31;
- shr.u32 %r239, %r238, 29;
- add.s32 %r240, %r237, %r239;
- shr.s32 %r2, %r240, 3;
+ add.s32 %r236, %r201, 7;
+ shr.s32 %r237, %r236, 31;
+ shr.u32 %r238, %r237, 29;
+ add.s32 %r239, %r236, %r238;
+ shr.s32 %r2, %r239, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p10, %r5, 0;
@%p10 bra $L__BB0_2;
- mov.u32 %r241, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
+ mov.u32 %r240, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r240;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd42, _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r242, [%rd42], %r5;
+ atom.shared.min.s32 %r241, [%rd42], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r243, %r4, %r2;
- shl.b32 %r244, %r243, 4;
- or.b32 %r245, %r244, 15;
- and.b32 %r7, %r245, -16;
- add.s32 %r246, %r245, %r7;
- and.b32 %r247, %r246, -16;
- cvt.s64.s32 %rd1, %r247;
- shl.b32 %r248, %r4, 2;
- max.s32 %r249, %r2, %r3;
- mad.lo.s32 %r250, %r248, %r249, 15;
- and.b32 %r251, %r250, -16;
- cvt.u64.u32 %rd2, %r251;
+ mul.lo.s32 %r242, %r4, %r2;
+ shl.b32 %r243, %r242, 4;
+ or.b32 %r244, %r243, 15;
+ and.b32 %r7, %r244, -16;
+ add.s32 %r245, %r244, %r7;
+ and.b32 %r246, %r245, -16;
+ cvt.s64.s32 %rd1, %r246;
+ shl.b32 %r247, %r4, 2;
+ max.s32 %r248, %r2, %r3;
+ mad.lo.s32 %r249, %r247, %r248, 15;
+ and.b32 %r250, %r249, -16;
+ cvt.u64.u32 %rd2, %r250;
mov.u64 %rd43, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd44, %rd43;
add.s64 %rd3, %rd44, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r252, %r8, 7;
- setp.lt.s32 %p11, %r252, %r202;
+ or.b32 %r251, %r8, 7;
+ setp.lt.s32 %p11, %r251, %r201;
setp.lt.s32 %p12, %r5, %r2;
and.pred %p1, %p11, %p12;
not.pred %p13, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p14, %r9, 0;
or.pred %p15, %p14, %p13;
@%p15 bra $L__BB0_4;
add.s64 %rd45, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r253, smem_ptr; }
-
-
- shl.b32 %r256, %r5, 4;
- add.s32 %r254, %r253, %r256;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd45; cvt.u32.u64 %r252, smem_ptr; }
+
+
+ shl.b32 %r255, %r5, 4;
+ add.s32 %r253, %r252, %r255;
mul.wide.s32 %rd47, %r8, 2;
add.s64 %rd46, %rd35, %rd47;
- mov.u32 %r255, 0;
+ mov.u32 %r254, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r255, 0;
- cp.async.ca.shared.global [%r254], [%rd46], 16, p0;
+ setp.ne.b32 p0, %r254, 0;
+ cp.async.ca.shared.global [%r253], [%rd46], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r583, %r6, 4;
- add.s32 %r257, %r4, 215;
- div.s32 %r258, %r257, %r4;
+ shl.b32 %r577, %r6, 4;
+ add.s32 %r256, %r4, 215;
+ div.s32 %r257, %r256, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r259, %r11, %r258;
- add.s32 %r260, %r259, -1;
- div.s32 %r12, %r260, %r11;
+ add.s32 %r258, %r11, %r257;
+ add.s32 %r259, %r258, -1;
+ div.s32 %r12, %r259, %r11;
setp.gt.s32 %p16, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p16 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r202;
+ cvt.rn.f64.s32 %fd1, %r201;
cvt.s64.s32 %rd48, %r7;
add.s64 %rd49, %rd48, %rd2;
add.s64 %rd51, %rd43, %rd2;
- mov.u32 %r262, %ctaid.y;
- mul.lo.s32 %r263, %r12, %r4;
- mul.lo.s32 %r13, %r263, %r262;
- shl.b32 %r264, %r9, 1;
- mov.u32 %r265, 1;
- shl.b32 %r266, %r5, 4;
- mad.lo.s32 %r14, %r264, %r202, %r266;
- mul.lo.s32 %r267, %r202, %r9;
- cvt.s64.s32 %rd52, %r267;
+ mov.u32 %r261, %ctaid.y;
+ mul.lo.s32 %r262, %r12, %r4;
+ mul.lo.s32 %r13, %r262, %r261;
+ mad.lo.s32 %r263, %r2, %r9, %r5;
+ shl.b32 %r14, %r263, 4;
+ mul.lo.s32 %r264, %r201, %r9;
+ cvt.s64.s32 %rd52, %r264;
cvt.s64.s32 %rd53, %r8;
add.s64 %rd5, %rd52, %rd53;
- mul.lo.s32 %r268, %r13, %r202;
- cvt.s64.s32 %rd6, %r268;
+ mul.lo.s32 %r265, %r13, %r201;
+ cvt.s64.s32 %rd6, %r265;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r269, %tid.z;
- mad.lo.s32 %r270, %r4, %r269, %r9;
- mad.lo.s32 %r15, %r270, %r3, %r5;
+ mov.u32 %r266, %tid.z;
+ mad.lo.s32 %r267, %r4, %r266, %r9;
+ mad.lo.s32 %r15, %r267, %r3, %r5;
mul.wide.u32 %rd54, %r15, 4;
add.s64 %rd7, %rd43, %rd54;
- clz.b32 %r271, %r3;
- mov.u32 %r272, 31;
- sub.s32 %r273, %r272, %r271;
- shl.b32 %r16, %r265, %r273;
+ clz.b32 %r268, %r3;
+ mov.u32 %r269, 31;
+ sub.s32 %r270, %r269, %r268;
+ mov.u32 %r271, 1;
+ shl.b32 %r16, %r271, %r270;
setp.lt.u32 %p17, %r5, %r16;
- add.s32 %r274, %r16, %r5;
- setp.lt.u32 %p18, %r274, %r3;
+ add.s32 %r272, %r16, %r5;
+ setp.lt.u32 %p18, %r272, %r3;
and.pred %p2, %p17, %p18;
- add.s32 %r275, %r15, %r16;
- mul.wide.s32 %rd55, %r275, 4;
+ add.s32 %r273, %r15, %r16;
+ mul.wide.s32 %rd55, %r273, 4;
add.s64 %rd8, %rd43, %rd55;
- shr.u32 %r276, %r16, 31;
- add.s32 %r277, %r16, %r276;
- shr.s32 %r17, %r277, 1;
- add.s32 %r18, %r267, %r8;
+ shr.u32 %r274, %r16, 31;
+ add.s32 %r275, %r16, %r274;
+ shr.s32 %r17, %r275, 1;
+ shl.b32 %r276, %r9, 3;
+ mad.lo.s32 %r277, %r276, %r2, %r8;
add.s64 %rd56, %rd43, %rd49;
- mul.wide.s32 %rd57, %r18, 2;
+ mul.wide.s32 %rd57, %r277, 2;
add.s64 %rd9, %rd56, %rd57;
add.s64 %rd58, %rd43, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd10, %rd58, %rd59;
add.s32 %r278, %r15, 1;
mul.wide.u32 %rd60, %r278, 4;
add.s64 %rd11, %rd43, %rd60;
add.s64 %rd12, %rd51, %rd57;
- mul.wide.s32 %rd61, %r270, 4;
+ mul.wide.s32 %rd61, %r267, 4;
add.s64 %rd13, %rd43, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd44, %rd49;
cvta.to.global.u64 %rd16, %rd34;
cvta.to.global.u64 %rd17, %rd33;
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r281, smem_ptr; }
- add.s32 %r282, %r14, %r281;
+ add.s32 %r282, %r281, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r284, smem_ptr; }
- add.s32 %r285, %r14, %r284;
+ add.s32 %r285, %r284, %r14;
not.pred %p24, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r580, %r4;
- add.s32 %r279, %r23, %r9;
- add.s32 %r24, %r279, %r13;
- setp.gt.s32 %p19, %r24, 215;
+ mul.lo.s32 %r22, %r574, %r4;
+ add.s32 %r279, %r22, %r9;
+ add.s32 %r23, %r279, %r13;
+ setp.gt.s32 %p19, %r23, 215;
mov.f32 %f622, %f187;
@%p19 bra $L__BB0_9;
- mul.lo.s32 %r280, %r24, %r211;
+ mul.lo.s32 %r280, %r23, %r210;
mul.wide.s32 %rd63, %r280, 4;
add.s64 %rd64, %rd17, %rd63;
ld.global.f32 %f622, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p20, %r24, 216;
+ setp.lt.s32 %p20, %r23, 216;
and.pred %p3, %p1, %p20;
not.pred %p21, %p3;
@%p21 bra $L__BB0_11;
- mul.lo.s32 %r578, %r580, %r4;
- mul.lo.s32 %r287, %r578, %r202;
+ mul.lo.s32 %r572, %r574, %r4;
+ mul.lo.s32 %r287, %r572, %r201;
cvt.s64.s32 %rd69, %r287;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd32, %rd72;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r576, %r580, %r4;
- add.s32 %r575, %r576, %r9;
- add.s32 %r574, %r575, %r13;
- setp.gt.s32 %p204, %r574, 215;
+ mul.lo.s32 %r571, %r574, %r4;
+ add.s32 %r570, %r571, %r9;
+ add.s32 %r569, %r570, %r13;
+ setp.gt.s32 %p204, %r569, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p204 bra $L__BB0_13;
- mul.lo.s32 %r288, %r24, %r215;
+ mul.lo.s32 %r288, %r23, %r214;
mul.wide.s32 %rd73, %r288, 4;
add.s64 %rd74, %rd16, %rd73;
ld.global.f32 %f623, [%rd74];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r583, %r583, 2;
+ shl.b32 %r577, %r577, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p24 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p25, %r16, 4;
bar.sync 0;
@%p25 bra $L__BB0_23;
- mov.u32 %r581, %r17;
+ mov.u32 %r575, %r17;
$L__BB0_20:
- setp.ge.u32 %p26, %r5, %r581;
+ setp.ge.u32 %p26, %r5, %r575;
@%p26 bra $L__BB0_22;
- add.s32 %r317, %r581, %r15;
+ add.s32 %r317, %r575, %r15;
mul.wide.s32 %rd75, %r317, 4;
add.s64 %rd77, %rd43, %rd75;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd77];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r581, 1;
- setp.gt.u32 %p27, %r581, 3;
- mov.u32 %r581, %r36;
+ shr.u32 %r35, %r575, 1;
+ setp.gt.u32 %p27, %r575, 3;
+ mov.u32 %r575, %r35;
@%p27 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p10 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p203, %r16, 4;
bar.sync 0;
@%p203 bra $L__BB0_33;
- mov.u32 %r582, %r17;
+ mov.u32 %r576, %r17;
$L__BB0_30:
- setp.ge.u32 %p32, %r5, %r582;
+ setp.ge.u32 %p32, %r5, %r576;
@%p32 bra $L__BB0_32;
- add.s32 %r318, %r582, %r15;
+ add.s32 %r318, %r576, %r15;
mul.wide.s32 %rd78, %r318, 4;
add.s64 %rd80, %rd43, %rd78;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd80];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r582, 1;
- setp.gt.u32 %p33, %r582, 3;
- mov.u32 %r582, %r38;
+ shr.u32 %r37, %r576, 1;
+ setp.gt.u32 %p33, %r576, 3;
+ mov.u32 %r576, %r37;
@%p33 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p10 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r577, %r580, %r4;
ld.shared.v4.u32 {%r327, %r328, %r329, %r330}, [%rd9];
ld.shared.v4.u32 {%r335, %r336, %r337, %r338}, [%rd10];
ld.shared.v4.u32 {%r343, %r344, %r345, %r346}, [%rd12];
mov.b32 {%rs97, %rs101}, %r335;
@@ -801,13 +800,12 @@
{ cvt.rn.bf16.f32 %rs124, %f365;}
mov.b32 %r326, {%rs124, %rs128};
- add.s32 %r351, %r13, %r577;
- mad.lo.s32 %r352, %r351, %r202, %r18;
- mul.wide.s32 %rd82, %r352, 2;
+ mad.lo.s32 %r351, %r23, %r201, %r8;
+ mul.wide.s32 %rd82, %r351, 2;
add.s64 %rd81, %rd36, %rd82;
st.global.cs.v4.s32 [%rd81], {%r323,%r324,%r325,%r326};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.bf16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r580, %r580, 1;
- setp.lt.s32 %p39, %r580, %r12;
+ add.s32 %r574, %r574, 1;
+ setp.lt.s32 %p39, %r574, %r12;
@%p39 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r353, %tid.z;
- mad.lo.s32 %r354, %r4, %r353, %r9;
- mad.lo.s32 %r50, %r354, %r3, %r5;
- mul.wide.u32 %rd83, %r50, 4;
+ mov.u32 %r352, %tid.z;
+ mad.lo.s32 %r353, %r4, %r352, %r9;
+ mad.lo.s32 %r49, %r353, %r3, %r5;
+ mul.wide.u32 %rd83, %r49, 4;
add.s64 %rd23, %rd43, %rd83;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r355, %r4;
- mov.u32 %r356, 31;
- sub.s32 %r51, %r356, %r355;
- mov.u32 %r357, 1;
- shl.b32 %r614, %r357, %r51;
- setp.lt.u32 %p40, %r9, %r614;
- add.s32 %r358, %r614, %r9;
- setp.lt.u32 %p41, %r358, %r4;
+ clz.b32 %r354, %r4;
+ mov.u32 %r355, 31;
+ sub.s32 %r50, %r355, %r354;
+ mov.u32 %r356, 1;
+ shl.b32 %r608, %r356, %r50;
+ setp.lt.u32 %p40, %r9, %r608;
+ add.s32 %r357, %r608, %r9;
+ setp.lt.u32 %p41, %r357, %r4;
and.pred %p4, %p40, %p41;
not.pred %p42, %p4;
@%p42 bra $L__BB0_46;
- shl.b32 %r359, %r3, %r51;
- add.s32 %r360, %r50, %r359;
- mul.wide.s32 %rd85, %r360, 4;
+ shl.b32 %r358, %r3, %r50;
+ add.s32 %r359, %r49, %r358;
+ mul.wide.s32 %rd85, %r359, 4;
add.s64 %rd87, %rd43, %rd85;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd87];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p43, %r614, 4;
+ setp.lt.s32 %p43, %r608, 4;
@%p43 bra $L__BB0_51;
- mov.u32 %r584, %r614;
+ mov.u32 %r578, %r608;
$L__BB0_48:
- shr.u32 %r54, %r584, 1;
- setp.ge.u32 %p44, %r9, %r54;
+ shr.u32 %r53, %r578, 1;
+ setp.ge.u32 %p44, %r9, %r53;
@%p44 bra $L__BB0_50;
- mad.lo.s32 %r361, %r54, %r3, %r50;
- mul.wide.s32 %rd88, %r361, 4;
+ mad.lo.s32 %r360, %r53, %r3, %r49;
+ mul.wide.s32 %rd88, %r360, 4;
add.s64 %rd90, %rd43, %rd88;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd90];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p45, %r584, 7;
- mov.u32 %r584, %r54;
+ setp.gt.u32 %p45, %r578, 7;
+ mov.u32 %r578, %r53;
@%p45 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r585, 0;
- add.s32 %r363, %r50, %r3;
- mul.wide.u32 %rd91, %r363, 4;
+ mov.u32 %r579, 0;
+ add.s32 %r362, %r49, %r3;
+ mul.wide.u32 %rd91, %r362, 4;
add.s64 %rd24, %rd43, %rd91;
@%p14 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r585, %f660;
+ mov.b32 %r579, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p42 bra $L__BB0_57;
- shl.b32 %r364, %r3, %r51;
- add.s32 %r365, %r50, %r364;
- mul.wide.s32 %rd93, %r365, 4;
+ shl.b32 %r363, %r3, %r50;
+ add.s32 %r364, %r49, %r363;
+ mul.wide.s32 %rd93, %r364, 4;
add.s64 %rd95, %rd43, %rd93;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd95];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p43 bra $L__BB0_62;
- mov.u32 %r586, %r614;
+ mov.u32 %r580, %r608;
$L__BB0_59:
- shr.u32 %r58, %r586, 1;
- setp.ge.u32 %p50, %r9, %r58;
+ shr.u32 %r57, %r580, 1;
+ setp.ge.u32 %p50, %r9, %r57;
@%p50 bra $L__BB0_61;
- mad.lo.s32 %r366, %r58, %r3, %r50;
- mul.wide.s32 %rd96, %r366, 4;
+ mad.lo.s32 %r365, %r57, %r3, %r49;
+ mul.wide.s32 %rd96, %r365, 4;
add.s64 %rd98, %rd43, %rd96;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd98];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p51, %r586, 7;
- mov.u32 %r586, %r58;
+ setp.gt.u32 %p51, %r580, 7;
+ mov.u32 %r580, %r57;
@%p51 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r587, 0;
+ mov.u32 %r581, 0;
@%p14 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p53, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r587, %f661;
+ mov.b32 %r581, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p42 bra $L__BB0_68;
- shl.b32 %r368, %r3, %r51;
- add.s32 %r369, %r50, %r368;
- mul.wide.s32 %rd99, %r369, 4;
+ shl.b32 %r367, %r3, %r50;
+ add.s32 %r368, %r49, %r367;
+ mul.wide.s32 %rd99, %r368, 4;
add.s64 %rd101, %rd43, %rd99;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd101];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p43 bra $L__BB0_73;
- mov.u32 %r588, %r614;
+ mov.u32 %r582, %r608;
$L__BB0_70:
- shr.u32 %r62, %r588, 1;
- setp.ge.u32 %p56, %r9, %r62;
+ shr.u32 %r61, %r582, 1;
+ setp.ge.u32 %p56, %r9, %r61;
@%p56 bra $L__BB0_72;
- mad.lo.s32 %r370, %r62, %r3, %r50;
- mul.wide.s32 %rd102, %r370, 4;
+ mad.lo.s32 %r369, %r61, %r3, %r49;
+ mul.wide.s32 %rd102, %r369, 4;
add.s64 %rd104, %rd43, %rd102;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd104];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p57, %r588, 7;
- mov.u32 %r588, %r62;
+ setp.gt.u32 %p57, %r582, 7;
+ mov.u32 %r582, %r61;
@%p57 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r589, 0;
+ mov.u32 %r583, 0;
@%p14 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p59, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r589, %f662;
+ mov.b32 %r583, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p42 bra $L__BB0_79;
- shl.b32 %r372, %r3, %r51;
- add.s32 %r373, %r50, %r372;
- mul.wide.s32 %rd105, %r373, 4;
+ shl.b32 %r371, %r3, %r50;
+ add.s32 %r372, %r49, %r371;
+ mul.wide.s32 %rd105, %r372, 4;
add.s64 %rd107, %rd43, %rd105;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd107];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p43 bra $L__BB0_84;
- mov.u32 %r590, %r614;
+ mov.u32 %r584, %r608;
$L__BB0_81:
- shr.u32 %r66, %r590, 1;
- setp.ge.u32 %p62, %r9, %r66;
+ shr.u32 %r65, %r584, 1;
+ setp.ge.u32 %p62, %r9, %r65;
@%p62 bra $L__BB0_83;
- mad.lo.s32 %r374, %r66, %r3, %r50;
- mul.wide.s32 %rd108, %r374, 4;
+ mad.lo.s32 %r373, %r65, %r3, %r49;
+ mul.wide.s32 %rd108, %r373, 4;
add.s64 %rd110, %rd43, %rd108;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd110];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p63, %r590, 7;
- mov.u32 %r590, %r66;
+ setp.gt.u32 %p63, %r584, 7;
+ mov.u32 %r584, %r65;
@%p63 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r591, 0;
+ mov.u32 %r585, 0;
@%p14 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p65, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r591, %f663;
+ mov.b32 %r585, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p42 bra $L__BB0_90;
- shl.b32 %r376, %r3, %r51;
- add.s32 %r377, %r50, %r376;
- mul.wide.s32 %rd111, %r377, 4;
+ shl.b32 %r375, %r3, %r50;
+ add.s32 %r376, %r49, %r375;
+ mul.wide.s32 %rd111, %r376, 4;
add.s64 %rd113, %rd43, %rd111;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd113];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p43 bra $L__BB0_95;
- mov.u32 %r592, %r614;
+ mov.u32 %r586, %r608;
$L__BB0_92:
- shr.u32 %r70, %r592, 1;
- setp.ge.u32 %p68, %r9, %r70;
+ shr.u32 %r69, %r586, 1;
+ setp.ge.u32 %p68, %r9, %r69;
@%p68 bra $L__BB0_94;
- mad.lo.s32 %r378, %r70, %r3, %r50;
- mul.wide.s32 %rd114, %r378, 4;
+ mad.lo.s32 %r377, %r69, %r3, %r49;
+ mul.wide.s32 %rd114, %r377, 4;
add.s64 %rd116, %rd43, %rd114;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd116];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p69, %r592, 7;
- mov.u32 %r592, %r70;
+ setp.gt.u32 %p69, %r586, 7;
+ mov.u32 %r586, %r69;
@%p69 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r593, 0;
+ mov.u32 %r587, 0;
@%p14 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p71, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r593, %f664;
+ mov.b32 %r587, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p42 bra $L__BB0_101;
- shl.b32 %r380, %r3, %r51;
- add.s32 %r381, %r50, %r380;
- mul.wide.s32 %rd117, %r381, 4;
+ shl.b32 %r379, %r3, %r50;
+ add.s32 %r380, %r49, %r379;
+ mul.wide.s32 %rd117, %r380, 4;
add.s64 %rd119, %rd43, %rd117;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd119];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p43 bra $L__BB0_106;
- mov.u32 %r594, %r614;
+ mov.u32 %r588, %r608;
$L__BB0_103:
- shr.u32 %r74, %r594, 1;
- setp.ge.u32 %p74, %r9, %r74;
+ shr.u32 %r73, %r588, 1;
+ setp.ge.u32 %p74, %r9, %r73;
@%p74 bra $L__BB0_105;
- mad.lo.s32 %r382, %r74, %r3, %r50;
- mul.wide.s32 %rd120, %r382, 4;
+ mad.lo.s32 %r381, %r73, %r3, %r49;
+ mul.wide.s32 %rd120, %r381, 4;
add.s64 %rd122, %rd43, %rd120;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd122];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p75, %r594, 7;
- mov.u32 %r594, %r74;
+ setp.gt.u32 %p75, %r588, 7;
+ mov.u32 %r588, %r73;
@%p75 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r595, 0;
+ mov.u32 %r589, 0;
@%p14 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p77, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r595, %f665;
+ mov.b32 %r589, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p42 bra $L__BB0_112;
- shl.b32 %r384, %r3, %r51;
- add.s32 %r385, %r50, %r384;
- mul.wide.s32 %rd123, %r385, 4;
+ shl.b32 %r383, %r3, %r50;
+ add.s32 %r384, %r49, %r383;
+ mul.wide.s32 %rd123, %r384, 4;
add.s64 %rd125, %rd43, %rd123;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd125];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p43 bra $L__BB0_117;
- mov.u32 %r596, %r614;
+ mov.u32 %r590, %r608;
$L__BB0_114:
- shr.u32 %r78, %r596, 1;
- setp.ge.u32 %p80, %r9, %r78;
+ shr.u32 %r77, %r590, 1;
+ setp.ge.u32 %p80, %r9, %r77;
@%p80 bra $L__BB0_116;
- mad.lo.s32 %r386, %r78, %r3, %r50;
- mul.wide.s32 %rd126, %r386, 4;
+ mad.lo.s32 %r385, %r77, %r3, %r49;
+ mul.wide.s32 %rd126, %r385, 4;
add.s64 %rd128, %rd43, %rd126;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd128];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p81, %r596, 7;
- mov.u32 %r596, %r78;
+ setp.gt.u32 %p81, %r590, 7;
+ mov.u32 %r590, %r77;
@%p81 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r597, 0;
+ mov.u32 %r591, 0;
@%p14 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p83, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r597, %f666;
+ mov.b32 %r591, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p42 bra $L__BB0_123;
- shl.b32 %r388, %r3, %r51;
- add.s32 %r389, %r50, %r388;
- mul.wide.s32 %rd129, %r389, 4;
+ shl.b32 %r387, %r3, %r50;
+ add.s32 %r388, %r49, %r387;
+ mul.wide.s32 %rd129, %r388, 4;
add.s64 %rd131, %rd43, %rd129;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd131];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p43 bra $L__BB0_128;
- mov.u32 %r598, %r614;
+ mov.u32 %r592, %r608;
$L__BB0_125:
- shr.u32 %r82, %r598, 1;
- setp.ge.u32 %p86, %r9, %r82;
+ shr.u32 %r81, %r592, 1;
+ setp.ge.u32 %p86, %r9, %r81;
@%p86 bra $L__BB0_127;
- mad.lo.s32 %r390, %r82, %r3, %r50;
- mul.wide.s32 %rd132, %r390, 4;
+ mad.lo.s32 %r389, %r81, %r3, %r49;
+ mul.wide.s32 %rd132, %r389, 4;
add.s64 %rd134, %rd43, %rd132;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd134];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p87, %r598, 7;
- mov.u32 %r598, %r82;
+ setp.gt.u32 %p87, %r592, 7;
+ mov.u32 %r592, %r81;
@%p87 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r599, 0;
+ mov.u32 %r593, 0;
@%p14 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p89, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r599, %f667;
+ mov.b32 %r593, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r583, 4;
+ shl.b32 %r84, %r577, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p42 bra $L__BB0_134;
- shl.b32 %r392, %r3, %r51;
- add.s32 %r393, %r50, %r392;
- mul.wide.s32 %rd135, %r393, 4;
+ shl.b32 %r391, %r3, %r50;
+ add.s32 %r392, %r49, %r391;
+ mul.wide.s32 %rd135, %r392, 4;
add.s64 %rd137, %rd43, %rd135;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd137];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p43 bra $L__BB0_139;
- mov.u32 %r600, %r614;
+ mov.u32 %r594, %r608;
$L__BB0_136:
- shr.u32 %r87, %r600, 1;
- setp.ge.u32 %p92, %r9, %r87;
+ shr.u32 %r86, %r594, 1;
+ setp.ge.u32 %p92, %r9, %r86;
@%p92 bra $L__BB0_138;
- mad.lo.s32 %r394, %r87, %r3, %r50;
- mul.wide.s32 %rd138, %r394, 4;
+ mad.lo.s32 %r393, %r86, %r3, %r49;
+ mul.wide.s32 %rd138, %r393, 4;
add.s64 %rd140, %rd43, %rd138;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd140];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p93, %r600, 7;
- mov.u32 %r600, %r87;
+ setp.gt.u32 %p93, %r594, 7;
+ mov.u32 %r594, %r86;
@%p93 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r601, 0;
+ mov.u32 %r595, 0;
@%p14 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p95, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r601, %f668;
+ mov.b32 %r595, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p42 bra $L__BB0_145;
- shl.b32 %r396, %r3, %r51;
- add.s32 %r397, %r50, %r396;
- mul.wide.s32 %rd141, %r397, 4;
+ shl.b32 %r395, %r3, %r50;
+ add.s32 %r396, %r49, %r395;
+ mul.wide.s32 %rd141, %r396, 4;
add.s64 %rd143, %rd43, %rd141;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd143];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p43 bra $L__BB0_150;
- mov.u32 %r602, %r614;
+ mov.u32 %r596, %r608;
$L__BB0_147:
- shr.u32 %r91, %r602, 1;
- setp.ge.u32 %p98, %r9, %r91;
+ shr.u32 %r90, %r596, 1;
+ setp.ge.u32 %p98, %r9, %r90;
@%p98 bra $L__BB0_149;
- mad.lo.s32 %r398, %r91, %r3, %r50;
- mul.wide.s32 %rd144, %r398, 4;
+ mad.lo.s32 %r397, %r90, %r3, %r49;
+ mul.wide.s32 %rd144, %r397, 4;
add.s64 %rd146, %rd43, %rd144;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd146];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p99, %r602, 7;
- mov.u32 %r602, %r91;
+ setp.gt.u32 %p99, %r596, 7;
+ mov.u32 %r596, %r90;
@%p99 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r603, 0;
+ mov.u32 %r597, 0;
@%p14 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p101, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r603, %f669;
+ mov.b32 %r597, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p42 bra $L__BB0_156;
- shl.b32 %r400, %r3, %r51;
- add.s32 %r401, %r50, %r400;
- mul.wide.s32 %rd147, %r401, 4;
+ shl.b32 %r399, %r3, %r50;
+ add.s32 %r400, %r49, %r399;
+ mul.wide.s32 %rd147, %r400, 4;
add.s64 %rd149, %rd43, %rd147;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd149];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p43 bra $L__BB0_161;
- mov.u32 %r604, %r614;
+ mov.u32 %r598, %r608;
$L__BB0_158:
- shr.u32 %r95, %r604, 1;
- setp.ge.u32 %p104, %r9, %r95;
+ shr.u32 %r94, %r598, 1;
+ setp.ge.u32 %p104, %r9, %r94;
@%p104 bra $L__BB0_160;
- mad.lo.s32 %r402, %r95, %r3, %r50;
- mul.wide.s32 %rd150, %r402, 4;
+ mad.lo.s32 %r401, %r94, %r3, %r49;
+ mul.wide.s32 %rd150, %r401, 4;
add.s64 %rd152, %rd43, %rd150;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd152];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p105, %r604, 7;
- mov.u32 %r604, %r95;
+ setp.gt.u32 %p105, %r598, 7;
+ mov.u32 %r598, %r94;
@%p105 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r605, 0;
+ mov.u32 %r599, 0;
@%p14 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p107, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r605, %f670;
+ mov.b32 %r599, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p42 bra $L__BB0_167;
- shl.b32 %r404, %r3, %r51;
- add.s32 %r405, %r50, %r404;
- mul.wide.s32 %rd153, %r405, 4;
+ shl.b32 %r403, %r3, %r50;
+ add.s32 %r404, %r49, %r403;
+ mul.wide.s32 %rd153, %r404, 4;
add.s64 %rd155, %rd43, %rd153;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd155];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p43 bra $L__BB0_172;
- mov.u32 %r606, %r614;
+ mov.u32 %r600, %r608;
$L__BB0_169:
- shr.u32 %r99, %r606, 1;
- setp.ge.u32 %p110, %r9, %r99;
+ shr.u32 %r98, %r600, 1;
+ setp.ge.u32 %p110, %r9, %r98;
@%p110 bra $L__BB0_171;
- mad.lo.s32 %r406, %r99, %r3, %r50;
- mul.wide.s32 %rd156, %r406, 4;
+ mad.lo.s32 %r405, %r98, %r3, %r49;
+ mul.wide.s32 %rd156, %r405, 4;
add.s64 %rd158, %rd43, %rd156;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd158];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p111, %r606, 7;
- mov.u32 %r606, %r99;
+ setp.gt.u32 %p111, %r600, 7;
+ mov.u32 %r600, %r98;
@%p111 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r607, 0;
+ mov.u32 %r601, 0;
@%p14 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p113, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r607, %f671;
+ mov.b32 %r601, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p42 bra $L__BB0_178;
- shl.b32 %r408, %r3, %r51;
- add.s32 %r409, %r50, %r408;
- mul.wide.s32 %rd159, %r409, 4;
+ shl.b32 %r407, %r3, %r50;
+ add.s32 %r408, %r49, %r407;
+ mul.wide.s32 %rd159, %r408, 4;
add.s64 %rd161, %rd43, %rd159;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd161];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p43 bra $L__BB0_183;
- mov.u32 %r608, %r614;
+ mov.u32 %r602, %r608;
$L__BB0_180:
- shr.u32 %r103, %r608, 1;
- setp.ge.u32 %p116, %r9, %r103;
+ shr.u32 %r102, %r602, 1;
+ setp.ge.u32 %p116, %r9, %r102;
@%p116 bra $L__BB0_182;
- mad.lo.s32 %r410, %r103, %r3, %r50;
- mul.wide.s32 %rd162, %r410, 4;
+ mad.lo.s32 %r409, %r102, %r3, %r49;
+ mul.wide.s32 %rd162, %r409, 4;
add.s64 %rd164, %rd43, %rd162;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd164];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p117, %r608, 7;
- mov.u32 %r608, %r103;
+ setp.gt.u32 %p117, %r602, 7;
+ mov.u32 %r602, %r102;
@%p117 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r609, 0;
+ mov.u32 %r603, 0;
@%p14 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p119, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r609, %f672;
+ mov.b32 %r603, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p42 bra $L__BB0_189;
- shl.b32 %r412, %r3, %r51;
- add.s32 %r413, %r50, %r412;
- mul.wide.s32 %rd165, %r413, 4;
+ shl.b32 %r411, %r3, %r50;
+ add.s32 %r412, %r49, %r411;
+ mul.wide.s32 %rd165, %r412, 4;
add.s64 %rd167, %rd43, %rd165;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd167];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p43 bra $L__BB0_194;
- mov.u32 %r610, %r614;
+ mov.u32 %r604, %r608;
$L__BB0_191:
- shr.u32 %r107, %r610, 1;
- setp.ge.u32 %p122, %r9, %r107;
+ shr.u32 %r106, %r604, 1;
+ setp.ge.u32 %p122, %r9, %r106;
@%p122 bra $L__BB0_193;
- mad.lo.s32 %r414, %r107, %r3, %r50;
- mul.wide.s32 %rd168, %r414, 4;
+ mad.lo.s32 %r413, %r106, %r3, %r49;
+ mul.wide.s32 %rd168, %r413, 4;
add.s64 %rd170, %rd43, %rd168;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd170];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p123, %r610, 7;
- mov.u32 %r610, %r107;
+ setp.gt.u32 %p123, %r604, 7;
+ mov.u32 %r604, %r106;
@%p123 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r611, 0;
+ mov.u32 %r605, 0;
@%p14 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p125, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r611, %f673;
+ mov.b32 %r605, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p42 bra $L__BB0_200;
- shl.b32 %r416, %r3, %r51;
- add.s32 %r417, %r50, %r416;
- mul.wide.s32 %rd171, %r417, 4;
+ shl.b32 %r415, %r3, %r50;
+ add.s32 %r416, %r49, %r415;
+ mul.wide.s32 %rd171, %r416, 4;
add.s64 %rd173, %rd43, %rd171;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd173];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p43 bra $L__BB0_205;
- mov.u32 %r612, %r614;
+ mov.u32 %r606, %r608;
$L__BB0_202:
- shr.u32 %r111, %r612, 1;
- setp.ge.u32 %p128, %r9, %r111;
+ shr.u32 %r110, %r606, 1;
+ setp.ge.u32 %p128, %r9, %r110;
@%p128 bra $L__BB0_204;
- mad.lo.s32 %r418, %r111, %r3, %r50;
- mul.wide.s32 %rd174, %r418, 4;
+ mad.lo.s32 %r417, %r110, %r3, %r49;
+ mul.wide.s32 %rd174, %r417, 4;
add.s64 %rd176, %rd43, %rd174;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd176];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p129, %r612, 7;
- mov.u32 %r612, %r111;
+ setp.gt.u32 %p129, %r606, 7;
+ mov.u32 %r606, %r110;
@%p129 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r613, 0;
+ mov.u32 %r607, 0;
@%p14 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p131, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r613, %f674;
+ mov.b32 %r607, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p42 bra $L__BB0_211;
- shl.b32 %r420, %r3, %r51;
- add.s32 %r421, %r50, %r420;
- mul.wide.s32 %rd177, %r421, 4;
+ shl.b32 %r419, %r3, %r50;
+ add.s32 %r420, %r49, %r419;
+ mul.wide.s32 %rd177, %r420, 4;
add.s64 %rd179, %rd43, %rd177;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd179];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p43 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r614, 1;
- setp.ge.u32 %p134, %r9, %r115;
+ shr.u32 %r114, %r608, 1;
+ setp.ge.u32 %p134, %r9, %r114;
@%p134 bra $L__BB0_214;
- mad.lo.s32 %r422, %r115, %r3, %r50;
- mul.wide.s32 %rd180, %r422, 4;
+ mad.lo.s32 %r421, %r114, %r3, %r49;
+ mul.wide.s32 %rd180, %r421, 4;
add.s64 %rd182, %rd43, %rd180;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd182];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p135, %r614, 7;
- mov.u32 %r614, %r115;
+ setp.gt.u32 %p135, %r608, 7;
+ mov.u32 %r608, %r114;
@%p135 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r615, 0;
+ mov.u32 %r609, 0;
@%p14 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p137, %r4, 2;
@@ -1735,255 +1733,251 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r615, %f675;
+ mov.b32 %r609, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p14 bra $L__BB0_226;
- shl.b32 %r573, %r5, 3;
- mov.u32 %r448, %ctaid.y;
- mad.lo.s32 %r449, %r202, %r448, %r573;
- add.s32 %r450, %r449, %r85;
- mul.wide.s32 %rd189, %r450, 4;
+ mov.u32 %r447, %ctaid.y;
+ mad.lo.s32 %r448, %r201, %r447, %r8;
+ add.s32 %r449, %r448, %r84;
+ mul.wide.s32 %rd189, %r449, 4;
add.s64 %rd187, %rd39, %rd189;
- st.volatile.global.v4.s32 [%rd187], {%r585,%r587,%r589,%r591};
-
- add.s32 %r451, %r450, 4;
- mul.wide.s32 %rd190, %r451, 4;
+ st.volatile.global.v4.s32 [%rd187], {%r579,%r581,%r583,%r585};
+
+ add.s32 %r450, %r449, 4;
+ mul.wide.s32 %rd190, %r450, 4;
add.s64 %rd188, %rd39, %rd190;
- st.volatile.global.v4.s32 [%rd188], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd188], {%r587,%r589,%r591,%r593};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r570, %r5, 3;
setp.eq.s32 %p138, %r9, 0;
and.pred %p5, %p138, %p12;
not.pred %p140, %p5;
- add.s32 %r424, %r570, 3;
- sub.s32 %r118, %r424, %r202;
- mov.u32 %r425, %ctaid.y;
- mad.lo.s32 %r119, %r202, %r425, %r570;
- neg.s32 %r426, %r85;
- setp.ge.s32 %p141, %r118, %r426;
+ add.s32 %r423, %r8, 3;
+ sub.s32 %r117, %r423, %r201;
+ mov.u32 %r424, %ctaid.y;
+ mad.lo.s32 %r118, %r201, %r424, %r8;
+ neg.s32 %r425, %r84;
+ setp.ge.s32 %p141, %r117, %r425;
or.pred %p142, %p140, %p141;
@%p142 bra $L__BB0_222;
- add.s32 %r431, %r119, %r85;
- mul.wide.s32 %rd184, %r431, 4;
+ add.s32 %r430, %r118, %r84;
+ mul.wide.s32 %rd184, %r430, 4;
add.s64 %rd183, %rd39, %rd184;
- st.volatile.global.v4.s32 [%rd183], {%r585,%r587,%r589,%r591};
+ st.volatile.global.v4.s32 [%rd183], {%r579,%r581,%r583,%r585};
$L__BB0_222:
- mov.u32 %r432, -4;
- sub.s32 %r433, %r432, %r85;
- setp.ge.s32 %p143, %r118, %r433;
+ mov.u32 %r431, -4;
+ sub.s32 %r432, %r431, %r84;
+ setp.ge.s32 %p143, %r117, %r432;
or.pred %p145, %p140, %p143;
@%p145 bra $L__BB0_226;
- add.s32 %r438, %r119, %r85;
- add.s32 %r439, %r438, 4;
- mul.wide.s32 %rd186, %r439, 4;
+ add.s32 %r437, %r118, %r84;
+ add.s32 %r438, %r437, 4;
+ mul.wide.s32 %rd186, %r438, 4;
add.s64 %rd185, %rd39, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r593,%r595,%r597,%r599};
+ st.volatile.global.v4.s32 [%rd185], {%r587,%r589,%r591,%r593};
$L__BB0_226:
- shl.b32 %r120, %r583, 5;
+ shl.b32 %r119, %r577, 5;
@%p1 bra $L__BB0_231;
bra.uni $L__BB0_227;
$L__BB0_231:
@%p14 bra $L__BB0_233;
- shl.b32 %r572, %r5, 3;
- mov.u32 %r476, %ctaid.y;
- mad.lo.s32 %r477, %r202, %r476, %r572;
- add.s32 %r478, %r477, %r120;
- mul.wide.s32 %rd197, %r478, 4;
+ mov.u32 %r475, %ctaid.y;
+ mad.lo.s32 %r476, %r201, %r475, %r8;
+ add.s32 %r477, %r476, %r119;
+ mul.wide.s32 %rd197, %r477, 4;
add.s64 %rd195, %rd40, %rd197;
- st.volatile.global.v4.s32 [%rd195], {%r601,%r603,%r605,%r607};
-
- add.s32 %r479, %r478, 4;
- mul.wide.s32 %rd198, %r479, 4;
+ st.volatile.global.v4.s32 [%rd195], {%r595,%r597,%r599,%r601};
+
+ add.s32 %r478, %r477, 4;
+ mul.wide.s32 %rd198, %r478, 4;
add.s64 %rd196, %rd40, %rd198;
- st.volatile.global.v4.s32 [%rd196], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd196], {%r603,%r605,%r607,%r609};
bra.uni $L__BB0_233;
$L__BB0_227:
- shl.b32 %r571, %r5, 3;
setp.eq.s32 %p147, %r9, 0;
and.pred %p6, %p147, %p12;
not.pred %p149, %p6;
- add.s32 %r452, %r571, 3;
- sub.s32 %r121, %r452, %r202;
- mov.u32 %r453, %ctaid.y;
- mad.lo.s32 %r122, %r202, %r453, %r571;
- neg.s32 %r454, %r120;
- setp.ge.s32 %p150, %r121, %r454;
+ add.s32 %r451, %r8, 3;
+ sub.s32 %r120, %r451, %r201;
+ mov.u32 %r452, %ctaid.y;
+ mad.lo.s32 %r121, %r201, %r452, %r8;
+ neg.s32 %r453, %r119;
+ setp.ge.s32 %p150, %r120, %r453;
or.pred %p151, %p149, %p150;
@%p151 bra $L__BB0_229;
- add.s32 %r459, %r122, %r120;
- mul.wide.s32 %rd192, %r459, 4;
+ add.s32 %r458, %r121, %r119;
+ mul.wide.s32 %rd192, %r458, 4;
add.s64 %rd191, %rd40, %rd192;
- st.volatile.global.v4.s32 [%rd191], {%r601,%r603,%r605,%r607};
+ st.volatile.global.v4.s32 [%rd191], {%r595,%r597,%r599,%r601};
$L__BB0_229:
- mov.u32 %r460, -4;
- sub.s32 %r461, %r460, %r120;
- setp.ge.s32 %p152, %r121, %r461;
+ mov.u32 %r459, -4;
+ sub.s32 %r460, %r459, %r119;
+ setp.ge.s32 %p152, %r120, %r460;
or.pred %p154, %p149, %p152;
@%p154 bra $L__BB0_233;
- add.s32 %r466, %r122, %r120;
- add.s32 %r467, %r466, 4;
- mul.wide.s32 %rd194, %r467, 4;
+ add.s32 %r465, %r121, %r119;
+ add.s32 %r466, %r465, 4;
+ mul.wide.s32 %rd194, %r466, 4;
add.s64 %rd193, %rd40, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r609,%r611,%r613,%r615};
+ st.volatile.global.v4.s32 [%rd193], {%r603,%r605,%r607,%r609};
$L__BB0_233:
- mov.u32 %r123, %ctaid.y;
+ mov.u32 %r122, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r480, %r5, %r9;
- or.b32 %r482, %r480, %r353;
- setp.ne.s32 %p156, %r482, 0;
+ or.b32 %r479, %r5, %r9;
+ or.b32 %r481, %r479, %r352;
+ setp.ne.s32 %p156, %r481, 0;
@%p156 bra $L__BB0_237;
ld.param.u64 %rd237, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd199, %rd237;
- mov.u32 %r483, %ctaid.x;
- mov.u32 %r484, %ctaid.z;
- mov.u32 %r485, %nctaid.x;
- mad.lo.s32 %r486, %r484, %r485, %r483;
- mul.wide.s32 %rd200, %r486, 8;
+ mov.u32 %r482, %ctaid.x;
+ mov.u32 %r483, %ctaid.z;
+ mov.u32 %r484, %nctaid.x;
+ mad.lo.s32 %r485, %r483, %r484, %r482;
+ mul.wide.s32 %rd200, %r485, 8;
add.s64 %rd27, %rd199, %rd200;
- add.s32 %r487, %r11, -1;
- setp.eq.s32 %p157, %r123, %r487;
+ add.s32 %r486, %r11, -1;
+ setp.eq.s32 %p157, %r122, %r486;
cvt.s64.s32 %rd201, %r11;
mov.u64 %rd202, -9223372036854775807;
sub.s64 %rd203, %rd202, %rd201;
selp.b64 %rd204, %rd203, 1, %p157;
atom.global.add.u64 %rd28, [%rd27], %rd204;
ld.volatile.global.u64 %rd205, [%rd27];
xor.b64 %rd206, %rd205, %rd28;
setp.lt.s64 %p158, %rd206, 0;
@%p158 bra $L__BB0_237;
- mov.u32 %r616, 8;
+ mov.u32 %r610, 8;
$L__BB0_236:
- nanosleep.u32 %r616;
-
- setp.lt.u32 %p159, %r616, 256;
- selp.u32 %r490, 1, 0, %p159;
- shl.b32 %r616, %r616, %r490;
+ nanosleep.u32 %r610;
+
+ setp.lt.u32 %p159, %r610, 256;
+ selp.u32 %r489, 1, 0, %p159;
+ shl.b32 %r610, %r610, %r489;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.gt.s64 %p160, %rd208, -1;
@%p160 bra $L__BB0_236;
$L__BB0_237:
bar.sync 0;
- add.s32 %r491, %r11, %r3;
- add.s32 %r492, %r491, -1;
- div.s32 %r126, %r492, %r3;
- setp.lt.s32 %p161, %r126, 1;
+ add.s32 %r490, %r11, %r3;
+ add.s32 %r491, %r490, -1;
+ div.s32 %r125, %r491, %r3;
+ setp.lt.s32 %p161, %r125, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_243;
- add.s32 %r494, %r202, 1;
- shr.u32 %r495, %r494, 31;
- add.s32 %r496, %r494, %r495;
- shr.s32 %r497, %r496, 1;
- add.s32 %r498, %r4, %r497;
- add.s32 %r499, %r498, -1;
- shl.b32 %r500, %r9, 1;
- shl.b32 %r501, %r4, 1;
- mad.lo.s32 %r502, %r501, %r123, %r500;
- or.b32 %r503, %r502, 1;
- setp.ge.s32 %p162, %r503, %r202;
- div.s32 %r504, %r499, %r4;
- setp.ge.s32 %p163, %r123, %r504;
+ add.s32 %r493, %r201, 1;
+ shr.u32 %r494, %r493, 31;
+ add.s32 %r495, %r493, %r494;
+ shr.s32 %r496, %r495, 1;
+ add.s32 %r497, %r4, %r496;
+ add.s32 %r498, %r497, -1;
+ shl.b32 %r499, %r9, 1;
+ shl.b32 %r500, %r4, 1;
+ mad.lo.s32 %r501, %r500, %r122, %r499;
+ or.b32 %r502, %r501, 1;
+ setp.ge.s32 %p162, %r502, %r201;
+ div.s32 %r503, %r498, %r4;
+ setp.ge.s32 %p163, %r122, %r503;
or.pred %p7, %p163, %p162;
- mul.lo.s32 %r505, %r4, %r123;
- shl.b32 %r506, %r505, 1;
- mad.lo.s32 %r507, %r202, %r5, %r506;
- add.s32 %r618, %r507, %r500;
- mul.lo.s32 %r128, %r202, %r3;
- mov.u32 %r493, 0;
+ mul.lo.s32 %r504, %r4, %r122;
+ shl.b32 %r505, %r504, 1;
+ mad.lo.s32 %r506, %r201, %r5, %r505;
+ add.s32 %r612, %r506, %r499;
+ mul.lo.s32 %r127, %r201, %r3;
+ mov.u32 %r492, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r617, %r5;
- mov.u32 %r619, %r493;
+ mov.u32 %r611, %r5;
+ mov.u32 %r613, %r492;
$L__BB0_239:
.pragma "nounroll";
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p7 bra $L__BB0_242;
- setp.ge.s32 %p164, %r617, %r11;
- mov.u32 %r620, %r493;
- mov.u32 %r621, %r493;
+ setp.ge.s32 %p164, %r611, %r11;
+ mov.u32 %r614, %r492;
+ mov.u32 %r615, %r492;
@%p164 bra $L__BB0_242;
- mul.wide.s32 %rd210, %r618, 4;
+ mul.wide.s32 %rd210, %r612, 4;
add.s64 %rd209, %rd39, %rd210;
- ld.volatile.global.v2.s32 {%r621,%r620}, [%rd209];
+ ld.volatile.global.v2.s32 {%r615,%r614}, [%rd209];
$L__BB0_242:
- mov.b32 %f558, %r621;
+ mov.b32 %f558, %r615;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r620;
+ mov.b32 %f559, %r614;
add.f32 %f678, %f678, %f559;
- add.s32 %r618, %r618, %r128;
- add.s32 %r617, %r617, %r3;
- add.s32 %r619, %r619, 1;
- setp.lt.s32 %p165, %r619, %r126;
+ add.s32 %r612, %r612, %r127;
+ add.s32 %r611, %r611, %r3;
+ add.s32 %r613, %r613, 1;
+ setp.lt.s32 %p165, %r613, %r125;
@%p165 bra $L__BB0_239;
$L__BB0_243:
- clz.b32 %r514, %r3;
- mov.u32 %r515, 31;
- sub.s32 %r516, %r515, %r514;
- mov.u32 %r517, 1;
- shl.b32 %r139, %r517, %r516;
- setp.lt.u32 %p166, %r5, %r139;
- add.s32 %r518, %r139, %r5;
- setp.lt.u32 %p167, %r518, %r3;
+ clz.b32 %r513, %r3;
+ mov.u32 %r514, 31;
+ sub.s32 %r515, %r514, %r513;
+ mov.u32 %r516, 1;
+ shl.b32 %r138, %r516, %r515;
+ setp.lt.u32 %p166, %r5, %r138;
+ add.s32 %r517, %r138, %r5;
+ setp.lt.u32 %p167, %r517, %r3;
and.pred %p8, %p166, %p167;
- add.s32 %r519, %r50, %r139;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r138;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd43, %rd211;
- shr.u32 %r520, %r139, 31;
- add.s32 %r521, %r139, %r520;
- shr.s32 %r630, %r521, 1;
+ shr.u32 %r519, %r138, 31;
+ add.s32 %r520, %r138, %r519;
+ shr.s32 %r624, %r520, 1;
st.shared.f32 [%rd23], %f679;
bar.sync 0;
not.pred %p168, %p8;
@%p168 bra $L__BB0_245;
@@ -1991,38 +1985,38 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_245:
- setp.lt.s32 %p169, %r139, 4;
+ setp.lt.s32 %p169, %r138, 4;
bar.sync 0;
@%p169 bra $L__BB0_250;
- mov.u32 %r622, %r630;
+ mov.u32 %r616, %r624;
$L__BB0_247:
- setp.ge.u32 %p170, %r5, %r622;
+ setp.ge.u32 %p170, %r5, %r616;
@%p170 bra $L__BB0_249;
- add.s32 %r522, %r622, %r50;
- mul.wide.s32 %rd213, %r522, 4;
+ add.s32 %r521, %r616, %r49;
+ mul.wide.s32 %rd213, %r521, 4;
add.s64 %rd215, %rd43, %rd213;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd215];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_249:
bar.sync 0;
- shr.u32 %r142, %r622, 1;
- setp.gt.u32 %p171, %r622, 3;
- mov.u32 %r622, %r142;
+ shr.u32 %r141, %r616, 1;
+ setp.gt.u32 %p171, %r616, 3;
+ mov.u32 %r616, %r141;
@%p171 bra $L__BB0_247;
$L__BB0_250:
- add.s32 %r523, %r50, 1;
- mul.wide.u32 %rd216, %r523, 4;
+ add.s32 %r522, %r49, 1;
+ mul.wide.u32 %rd216, %r522, 4;
add.s64 %rd30, %rd43, %rd216;
mov.f32 %f680, 0f00000000;
@%p10 bra $L__BB0_253;
setp.lt.u32 %p173, %r3, 2;
@@ -2050,29 +2044,29 @@
$L__BB0_255:
bar.sync 0;
@%p169 bra $L__BB0_260;
- mov.u32 %r623, %r630;
+ mov.u32 %r617, %r624;
$L__BB0_257:
- setp.ge.u32 %p176, %r5, %r623;
+ setp.ge.u32 %p176, %r5, %r617;
@%p176 bra $L__BB0_259;
- add.s32 %r524, %r623, %r50;
- mul.wide.s32 %rd218, %r524, 4;
+ add.s32 %r523, %r617, %r49;
+ mul.wide.s32 %rd218, %r523, 4;
add.s64 %rd220, %rd43, %rd218;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd220];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_259:
bar.sync 0;
- shr.u32 %r144, %r623, 1;
- setp.gt.u32 %p177, %r623, 3;
- mov.u32 %r623, %r144;
+ shr.u32 %r143, %r617, 1;
+ setp.gt.u32 %p177, %r617, 3;
+ mov.u32 %r617, %r143;
@%p177 bra $L__BB0_257;
$L__BB0_260:
mov.f32 %f681, 0f00000000;
@%p10 bra $L__BB0_263;
@@ -2091,90 +2085,90 @@
{ cvt.rn.bf16.f32 %rs130, %f681;}
@%p10 bra $L__BB0_267;
- add.s32 %r525, %r202, 1;
- shr.u32 %r526, %r525, 31;
- add.s32 %r527, %r525, %r526;
- shr.s32 %r528, %r527, 1;
- add.s32 %r529, %r4, %r528;
- add.s32 %r530, %r529, -1;
- div.s32 %r531, %r530, %r4;
- setp.ge.s32 %p181, %r123, %r531;
+ add.s32 %r524, %r201, 1;
+ shr.u32 %r525, %r524, 31;
+ add.s32 %r526, %r524, %r525;
+ shr.s32 %r527, %r526, 1;
+ add.s32 %r528, %r4, %r527;
+ add.s32 %r529, %r528, -1;
+ div.s32 %r530, %r529, %r4;
+ setp.ge.s32 %p181, %r122, %r530;
@%p181 bra $L__BB0_267;
- shl.b32 %r145, %r9, 1;
- mul.lo.s32 %r532, %r4, %r123;
- shl.b32 %r146, %r532, 1;
- add.s32 %r533, %r145, %r146;
- or.b32 %r534, %r533, 1;
- setp.ge.s32 %p182, %r534, %r202;
+ shl.b32 %r144, %r9, 1;
+ mul.lo.s32 %r531, %r4, %r122;
+ shl.b32 %r145, %r531, 1;
+ add.s32 %r532, %r144, %r145;
+ or.b32 %r533, %r532, 1;
+ setp.ge.s32 %p182, %r533, %r201;
@%p182 bra $L__BB0_267;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r535, %r146, %r145;
+ add.s32 %r534, %r145, %r144;
cvta.to.global.u64 %rd221, %rd236;
- mul.wide.s32 %rd222, %r535, 2;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd221, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_267:
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p161 bra $L__BB0_273;
- add.s32 %r537, %r202, 1;
- shr.u32 %r538, %r537, 31;
- add.s32 %r539, %r537, %r538;
- shr.s32 %r540, %r539, 1;
- add.s32 %r541, %r4, %r540;
- add.s32 %r542, %r541, -1;
- shl.b32 %r543, %r9, 1;
- shl.b32 %r544, %r4, 1;
- mad.lo.s32 %r545, %r544, %r123, %r543;
- or.b32 %r546, %r545, 1;
- setp.ge.s32 %p184, %r546, %r202;
- div.s32 %r547, %r542, %r4;
- setp.ge.s32 %p185, %r123, %r547;
+ add.s32 %r536, %r201, 1;
+ shr.u32 %r537, %r536, 31;
+ add.s32 %r538, %r536, %r537;
+ shr.s32 %r539, %r538, 1;
+ add.s32 %r540, %r4, %r539;
+ add.s32 %r541, %r540, -1;
+ shl.b32 %r542, %r9, 1;
+ shl.b32 %r543, %r4, 1;
+ mad.lo.s32 %r544, %r543, %r122, %r542;
+ or.b32 %r545, %r544, 1;
+ setp.ge.s32 %p184, %r545, %r201;
+ div.s32 %r546, %r541, %r4;
+ setp.ge.s32 %p185, %r122, %r546;
or.pred %p9, %p185, %p184;
- mul.lo.s32 %r548, %r4, %r123;
- shl.b32 %r549, %r548, 1;
- mad.lo.s32 %r550, %r202, %r5, %r549;
- add.s32 %r625, %r550, %r543;
- mul.lo.s32 %r148, %r202, %r3;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r547, %r4, %r122;
+ shl.b32 %r548, %r547, 1;
+ mad.lo.s32 %r549, %r201, %r5, %r548;
+ add.s32 %r619, %r549, %r542;
+ mul.lo.s32 %r147, %r201, %r3;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r624, %r5;
- mov.u32 %r626, %r536;
+ mov.u32 %r618, %r5;
+ mov.u32 %r620, %r535;
$L__BB0_269:
.pragma "nounroll";
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p9 bra $L__BB0_272;
- setp.ge.s32 %p186, %r624, %r11;
- mov.u32 %r627, %r536;
- mov.u32 %r628, %r536;
+ setp.ge.s32 %p186, %r618, %r11;
+ mov.u32 %r621, %r535;
+ mov.u32 %r622, %r535;
@%p186 bra $L__BB0_272;
- mul.wide.s32 %rd225, %r625, 4;
+ mul.wide.s32 %rd225, %r619, 4;
add.s64 %rd224, %rd40, %rd225;
- ld.volatile.global.v2.s32 {%r628,%r627}, [%rd224];
+ ld.volatile.global.v2.s32 {%r622,%r621}, [%rd224];
$L__BB0_272:
- mov.b32 %f584, %r628;
+ mov.b32 %f584, %r622;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r627;
+ mov.b32 %f585, %r621;
add.f32 %f684, %f684, %f585;
- add.s32 %r625, %r625, %r148;
- add.s32 %r624, %r624, %r3;
- add.s32 %r626, %r626, 1;
- setp.lt.s32 %p187, %r626, %r126;
+ add.s32 %r619, %r619, %r147;
+ add.s32 %r618, %r618, %r3;
+ add.s32 %r620, %r620, 1;
+ setp.lt.s32 %p187, %r620, %r125;
@%p187 bra $L__BB0_269;
$L__BB0_273:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2187,29 +2181,29 @@
$L__BB0_275:
bar.sync 0;
@%p169 bra $L__BB0_280;
- mov.u32 %r629, %r630;
+ mov.u32 %r623, %r624;
$L__BB0_277:
- setp.ge.u32 %p190, %r5, %r629;
+ setp.ge.u32 %p190, %r5, %r623;
@%p190 bra $L__BB0_279;
- add.s32 %r557, %r629, %r50;
- mul.wide.s32 %rd226, %r557, 4;
+ add.s32 %r556, %r623, %r49;
+ mul.wide.s32 %rd226, %r556, 4;
add.s64 %rd228, %rd43, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_279:
bar.sync 0;
- shr.u32 %r160, %r629, 1;
- setp.gt.u32 %p191, %r629, 3;
- mov.u32 %r629, %r160;
+ shr.u32 %r159, %r623, 1;
+ setp.gt.u32 %p191, %r623, 3;
+ mov.u32 %r623, %r159;
@%p191 bra $L__BB0_277;
$L__BB0_280:
mov.f32 %f686, 0f00000000;
@%p10 bra $L__BB0_283;
@@ -2240,26 +2234,26 @@
$L__BB0_285:
bar.sync 0;
@%p169 bra $L__BB0_289;
$L__BB0_286:
- setp.ge.u32 %p196, %r5, %r630;
+ setp.ge.u32 %p196, %r5, %r624;
@%p196 bra $L__BB0_288;
- add.s32 %r558, %r630, %r50;
- mul.wide.s32 %rd229, %r558, 4;
+ add.s32 %r557, %r624, %r49;
+ mul.wide.s32 %rd229, %r557, 4;
add.s64 %rd231, %rd43, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_288:
bar.sync 0;
- shr.u32 %r162, %r630, 1;
- setp.gt.u32 %p197, %r630, 3;
- mov.u32 %r630, %r162;
+ shr.u32 %r161, %r624, 1;
+ setp.gt.u32 %p197, %r624, 3;
+ mov.u32 %r624, %r161;
@%p197 bra $L__BB0_286;
$L__BB0_289:
mov.f32 %f687, 0f00000000;
@%p10 bra $L__BB0_292;
@@ -2278,32 +2272,32 @@
{ cvt.rn.bf16.f32 %rs132, %f687;}
@%p10 bra $L__BB0_296;
- add.s32 %r559, %r202, 1;
- shr.u32 %r560, %r559, 31;
- add.s32 %r561, %r559, %r560;
- shr.s32 %r562, %r561, 1;
- add.s32 %r563, %r4, %r562;
- add.s32 %r564, %r563, -1;
- div.s32 %r565, %r564, %r4;
- setp.ge.s32 %p201, %r123, %r565;
+ add.s32 %r558, %r201, 1;
+ shr.u32 %r559, %r558, 31;
+ add.s32 %r560, %r558, %r559;
+ shr.s32 %r561, %r560, 1;
+ add.s32 %r562, %r4, %r561;
+ add.s32 %r563, %r562, -1;
+ div.s32 %r564, %r563, %r4;
+ setp.ge.s32 %p201, %r122, %r564;
@%p201 bra $L__BB0_296;
- shl.b32 %r163, %r9, 1;
- mul.lo.s32 %r566, %r4, %r123;
- shl.b32 %r164, %r566, 1;
- add.s32 %r567, %r163, %r164;
- or.b32 %r568, %r567, 1;
- setp.ge.s32 %p202, %r568, %r202;
+ shl.b32 %r162, %r9, 1;
+ mul.lo.s32 %r565, %r4, %r122;
+ shl.b32 %r163, %r565, 1;
+ add.s32 %r566, %r162, %r163;
+ or.b32 %r567, %r566, 1;
+ setp.ge.s32 %p202, %r567, %r201;
@%p202 bra $L__BB0_296;
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
- add.s32 %r569, %r164, %r163;
+ add.s32 %r568, %r163, %r162;
cvta.to.global.u64 %rd232, %rd235;
- mul.wide.s32 %rd233, %r569, 2;
+ mul.wide.s32 %rd233, %r568, 2;
add.s64 %rd234, %rd232, %rd233;
st.global.v2.u16 [%rd234], {%rs131, %rs132};
$L__BB0_296:
ret;
21: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_576
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 64
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
blockReduce<true, false, false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<610>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r222, %r223}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r226, %r227}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r248, %r213, 7;
shr.s32 %r249, %r248, 31;
shr.u32 %r250, %r249, 29;
add.s32 %r251, %r248, %r250;
shr.s32 %r2, %r251, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
mov.u32 %r252, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r252;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r253, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r254, %r4, %r2;
shl.b32 %r255, %r254, 4;
or.b32 %r256, %r255, 15;
and.b32 %r7, %r256, -16;
add.s32 %r257, %r256, %r7;
and.b32 %r258, %r257, -16;
cvt.s64.s32 %rd1, %r258;
shl.b32 %r259, %r4, 2;
max.s32 %r260, %r2, %r3;
mad.lo.s32 %r261, %r259, %r260, 15;
and.b32 %r262, %r261, -16;
cvt.u64.u32 %rd2, %r262;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r263, %r8, 7;
setp.lt.s32 %p9, %r263, %r213;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r264, smem_ptr; }
// end inline asm
shl.b32 %r267, %r5, 4;
add.s32 %r265, %r264, %r267;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r266, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r266, 0;
cp.async.ca.shared.global [%r265], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r562, %r6, 4;
add.s32 %r268, %r4, 215;
div.s32 %r269, %r268, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r270, %r11, %r269;
add.s32 %r271, %r270, -1;
div.s32 %r12, %r271, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r213;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
mov.u32 %r273, %ctaid.y;
mul.lo.s32 %r274, %r12, %r4;
mul.lo.s32 %r13, %r274, %r273;
shl.b32 %r275, %r9, 1;
mov.u32 %r276, 1;
shl.b32 %r277, %r5, 4;
mad.lo.s32 %r14, %r275, %r213, %r277;
mul.lo.s32 %r278, %r213, %r9;
cvt.s64.s32 %rd54, %r278;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r279, %r13, %r213;
cvt.s64.s32 %rd6, %r279;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r280, %tid.z;
mad.lo.s32 %r281, %r4, %r280, %r9;
mad.lo.s32 %r15, %r281, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
clz.b32 %r282, %r3;
mov.u32 %r283, 31;
sub.s32 %r284, %r283, %r282;
shl.b32 %r16, %r276, %r284;
setp.lt.u32 %p15, %r5, %r16;
add.s32 %r285, %r16, %r5;
setp.lt.u32 %p16, %r285, %r3;
and.pred %p2, %p15, %p16;
add.s32 %r286, %r15, %r16;
mul.wide.s32 %rd57, %r286, 4;
add.s64 %rd8, %rd45, %rd57;
shr.u32 %r287, %r16, 31;
add.s32 %r288, %r16, %r287;
shr.s32 %r17, %r288, 1;
add.s32 %r18, %r278, %r8;
add.s64 %rd58, %rd45, %rd51;
mul.wide.s32 %rd59, %r18, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
mul.wide.s32 %rd63, %r281, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
mov.u32 %r559, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
// end inline asm
add.s32 %r293, %r14, %r292;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
// end inline asm
add.s32 %r296, %r14, %r295;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r559, %r4;
add.s32 %r290, %r23, %r9;
add.s32 %r24, %r290, %r13;
setp.gt.s32 %p17, %r24, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
mul.lo.s32 %r291, %r24, %r222;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
setp.lt.s32 %p18, %r24, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
mul.lo.s32 %r557, %r559, %r4;
mul.lo.s32 %r298, %r557, %r213;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
mov.u32 %r297, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r293], [%rd68], 16, p0;
}
// end inline asm
add.s64 %rd70, %rd33, %rd74;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r296], [%rd70], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r555, %r559, %r4;
add.s32 %r554, %r555, %r9;
add.s32 %r553, %r554, %r13;
setp.gt.s32 %p199, %r553, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
mul.lo.s32 %r299, %r24, %r226;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r304, %r305, %r306, %r307}, [%rd9];
ld.shared.v4.u32 {%r312, %r313, %r314, %r315}, [%rd10];
ld.shared.v4.u32 {%r320, %r321, %r322, %r323}, [%rd12];
mov.b32 {%rs36, %rs39}, %r320;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r312;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r304;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r321;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r313;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r305;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r322;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r314;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r306;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r323;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r315;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r307;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r562, %r562, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
mov.u32 %r560, %r17;
$L__BB0_20:
setp.ge.u32 %p24, %r5, %r560;
@%p24 bra $L__BB0_22;
add.s32 %r328, %r560, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r36, %r560, 1;
setp.gt.u32 %p25, %r560, 3;
mov.u32 %r560, %r36;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
setp.lt.u32 %p27, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p27 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p22 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
mov.u32 %r561, %r17;
$L__BB0_30:
setp.ge.u32 %p30, %r5, %r561;
@%p30 bra $L__BB0_32;
add.s32 %r329, %r561, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r38, %r561, 1;
setp.gt.u32 %p31, %r561, 3;
mov.u32 %r561, %r38;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
setp.lt.u32 %p33, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p33 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p8 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p8 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
mul.lo.s32 %r556, %r559, %r4;
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r354;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r338;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r334, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r347;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r355;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r339;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r335, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r348;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r356;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r340;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r336, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r349;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r357;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r341;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r337, {%rs124, %rs128};
add.s32 %r362, %r13, %r556;
mad.lo.s32 %r363, %r362, %r213, %r18;
mul.wide.s32 %rd84, %r363, 2;
add.s64 %rd83, %rd38, %rd84;
// begin inline asm
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r559, %r559, 1;
setp.lt.s32 %p37, %r559, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r364, %tid.z;
mad.lo.s32 %r365, %r4, %r364, %r9;
mad.lo.s32 %r50, %r365, %r3, %r5;
mul.wide.u32 %rd85, %r50, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r366, %r4;
mov.u32 %r367, 31;
sub.s32 %r51, %r367, %r366;
mov.u32 %r368, 1;
shl.b32 %r593, %r368, %r51;
setp.lt.u32 %p38, %r9, %r593;
add.s32 %r369, %r593, %r9;
setp.lt.u32 %p39, %r369, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
shl.b32 %r370, %r3, %r51;
add.s32 %r371, %r50, %r370;
mul.wide.s32 %rd87, %r371, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p41, %r593, 4;
@%p41 bra $L__BB0_51;
mov.u32 %r563, %r593;
$L__BB0_48:
shr.u32 %r54, %r563, 1;
setp.ge.u32 %p42, %r9, %r54;
@%p42 bra $L__BB0_50;
mad.lo.s32 %r372, %r54, %r3, %r50;
mul.wide.s32 %rd90, %r372, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p43, %r563, 7;
mov.u32 %r563, %r54;
@%p43 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r564, 0;
add.s32 %r374, %r50, %r3;
mul.wide.u32 %rd93, %r374, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p45, %r4, 2;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r564, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
shl.b32 %r375, %r3, %r51;
add.s32 %r376, %r50, %r375;
mul.wide.s32 %rd95, %r376, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
mov.u32 %r565, %r593;
$L__BB0_59:
shr.u32 %r58, %r565, 1;
setp.ge.u32 %p48, %r9, %r58;
@%p48 bra $L__BB0_61;
mad.lo.s32 %r377, %r58, %r3, %r50;
mul.wide.s32 %rd98, %r377, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p49, %r565, 7;
mov.u32 %r565, %r58;
@%p49 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r566, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@%p51 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r566, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
shl.b32 %r379, %r3, %r51;
add.s32 %r380, %r50, %r379;
mul.wide.s32 %rd101, %r380, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
mov.u32 %r567, %r593;
$L__BB0_70:
shr.u32 %r62, %r567, 1;
setp.ge.u32 %p54, %r9, %r62;
@%p54 bra $L__BB0_72;
mad.lo.s32 %r381, %r62, %r3, %r50;
mul.wide.s32 %rd104, %r381, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p55, %r567, 7;
mov.u32 %r567, %r62;
@%p55 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r568, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r568, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
shl.b32 %r383, %r3, %r51;
add.s32 %r384, %r50, %r383;
mul.wide.s32 %rd107, %r384, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
mov.u32 %r569, %r593;
$L__BB0_81:
shr.u32 %r66, %r569, 1;
setp.ge.u32 %p60, %r9, %r66;
@%p60 bra $L__BB0_83;
mad.lo.s32 %r385, %r66, %r3, %r50;
mul.wide.s32 %rd110, %r385, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p61, %r569, 7;
mov.u32 %r569, %r66;
@%p61 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r570, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r570, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
shl.b32 %r387, %r3, %r51;
add.s32 %r388, %r50, %r387;
mul.wide.s32 %rd113, %r388, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
mov.u32 %r571, %r593;
$L__BB0_92:
shr.u32 %r70, %r571, 1;
setp.ge.u32 %p66, %r9, %r70;
@%p66 bra $L__BB0_94;
mad.lo.s32 %r389, %r70, %r3, %r50;
mul.wide.s32 %rd116, %r389, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p67, %r571, 7;
mov.u32 %r571, %r70;
@%p67 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r572, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r572, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
shl.b32 %r391, %r3, %r51;
add.s32 %r392, %r50, %r391;
mul.wide.s32 %rd119, %r392, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
mov.u32 %r573, %r593;
$L__BB0_103:
shr.u32 %r74, %r573, 1;
setp.ge.u32 %p72, %r9, %r74;
@%p72 bra $L__BB0_105;
mad.lo.s32 %r393, %r74, %r3, %r50;
mul.wide.s32 %rd122, %r393, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p73, %r573, 7;
mov.u32 %r573, %r74;
@%p73 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r574, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r574, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
shl.b32 %r395, %r3, %r51;
add.s32 %r396, %r50, %r395;
mul.wide.s32 %rd125, %r396, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
mov.u32 %r575, %r593;
$L__BB0_114:
shr.u32 %r78, %r575, 1;
setp.ge.u32 %p78, %r9, %r78;
@%p78 bra $L__BB0_116;
mad.lo.s32 %r397, %r78, %r3, %r50;
mul.wide.s32 %rd128, %r397, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p79, %r575, 7;
mov.u32 %r575, %r78;
@%p79 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r576, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r576, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
shl.b32 %r399, %r3, %r51;
add.s32 %r400, %r50, %r399;
mul.wide.s32 %rd131, %r400, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
mov.u32 %r577, %r593;
$L__BB0_125:
shr.u32 %r82, %r577, 1;
setp.ge.u32 %p84, %r9, %r82;
@%p84 bra $L__BB0_127;
mad.lo.s32 %r401, %r82, %r3, %r50;
mul.wide.s32 %rd134, %r401, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p85, %r577, 7;
mov.u32 %r577, %r82;
@%p85 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r578, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r578, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r85, %r562, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
shl.b32 %r403, %r3, %r51;
add.s32 %r404, %r50, %r403;
mul.wide.s32 %rd137, %r404, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
mov.u32 %r579, %r593;
$L__BB0_136:
shr.u32 %r87, %r579, 1;
setp.ge.u32 %p90, %r9, %r87;
@%p90 bra $L__BB0_138;
mad.lo.s32 %r405, %r87, %r3, %r50;
mul.wide.s32 %rd140, %r405, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p91, %r579, 7;
mov.u32 %r579, %r87;
@%p91 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r580, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r580, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
shl.b32 %r407, %r3, %r51;
add.s32 %r408, %r50, %r407;
mul.wide.s32 %rd143, %r408, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
mov.u32 %r581, %r593;
$L__BB0_147:
shr.u32 %r91, %r581, 1;
setp.ge.u32 %p96, %r9, %r91;
@%p96 bra $L__BB0_149;
mad.lo.s32 %r409, %r91, %r3, %r50;
mul.wide.s32 %rd146, %r409, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p97, %r581, 7;
mov.u32 %r581, %r91;
@%p97 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r582, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r582, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
shl.b32 %r411, %r3, %r51;
add.s32 %r412, %r50, %r411;
mul.wide.s32 %rd149, %r412, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
mov.u32 %r583, %r593;
$L__BB0_158:
shr.u32 %r95, %r583, 1;
setp.ge.u32 %p102, %r9, %r95;
@%p102 bra $L__BB0_160;
mad.lo.s32 %r413, %r95, %r3, %r50;
mul.wide.s32 %rd152, %r413, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p103, %r583, 7;
mov.u32 %r583, %r95;
@%p103 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r584, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r584, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
shl.b32 %r415, %r3, %r51;
add.s32 %r416, %r50, %r415;
mul.wide.s32 %rd155, %r416, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
mov.u32 %r585, %r593;
$L__BB0_169:
shr.u32 %r99, %r585, 1;
setp.ge.u32 %p108, %r9, %r99;
@%p108 bra $L__BB0_171;
mad.lo.s32 %r417, %r99, %r3, %r50;
mul.wide.s32 %rd158, %r417, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p109, %r585, 7;
mov.u32 %r585, %r99;
@%p109 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r586, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r586, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
shl.b32 %r419, %r3, %r51;
add.s32 %r420, %r50, %r419;
mul.wide.s32 %rd161, %r420, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
mov.u32 %r587, %r593;
$L__BB0_180:
shr.u32 %r103, %r587, 1;
setp.ge.u32 %p114, %r9, %r103;
@%p114 bra $L__BB0_182;
mad.lo.s32 %r421, %r103, %r3, %r50;
mul.wide.s32 %rd164, %r421, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p115, %r587, 7;
mov.u32 %r587, %r103;
@%p115 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r588, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r588, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
shl.b32 %r423, %r3, %r51;
add.s32 %r424, %r50, %r423;
mul.wide.s32 %rd167, %r424, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
mov.u32 %r589, %r593;
$L__BB0_191:
shr.u32 %r107, %r589, 1;
setp.ge.u32 %p120, %r9, %r107;
@%p120 bra $L__BB0_193;
mad.lo.s32 %r425, %r107, %r3, %r50;
mul.wide.s32 %rd170, %r425, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p121, %r589, 7;
mov.u32 %r589, %r107;
@%p121 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r590, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r590, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
shl.b32 %r427, %r3, %r51;
add.s32 %r428, %r50, %r427;
mul.wide.s32 %rd173, %r428, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
mov.u32 %r591, %r593;
$L__BB0_202:
shr.u32 %r111, %r591, 1;
setp.ge.u32 %p126, %r9, %r111;
@%p126 bra $L__BB0_204;
mad.lo.s32 %r429, %r111, %r3, %r50;
mul.wide.s32 %rd176, %r429, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p127, %r591, 7;
mov.u32 %r591, %r111;
@%p127 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r592, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r592, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
shl.b32 %r431, %r3, %r51;
add.s32 %r432, %r50, %r431;
mul.wide.s32 %rd179, %r432, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r115, %r593, 1;
setp.ge.u32 %p132, %r9, %r115;
@%p132 bra $L__BB0_214;
mad.lo.s32 %r433, %r115, %r3, %r50;
mul.wide.s32 %rd182, %r433, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p133, %r593, 7;
mov.u32 %r593, %r115;
@%p133 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r594, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r594, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
shl.b32 %r552, %r5, 3;
mov.u32 %r459, %ctaid.y;
mad.lo.s32 %r460, %r213, %r459, %r552;
add.s32 %r461, %r460, %r85;
mul.wide.s32 %rd191, %r461, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r564,%r566,%r568,%r570};
// end inline asm
add.s32 %r462, %r461, 4;
mul.wide.s32 %rd192, %r462, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r572,%r574,%r576,%r578};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
shl.b32 %r549, %r5, 3;
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
add.s32 %r435, %r549, 3;
sub.s32 %r118, %r435, %r213;
mov.u32 %r436, %ctaid.y;
mad.lo.s32 %r119, %r213, %r436, %r549;
neg.s32 %r437, %r85;
setp.ge.s32 %p139, %r118, %r437;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
add.s32 %r442, %r119, %r85;
mul.wide.s32 %rd186, %r442, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r564,%r566,%r568,%r570};
// end inline asm
$L__BB0_222:
mov.u32 %r443, -4;
sub.s32 %r444, %r443, %r85;
setp.ge.s32 %p141, %r118, %r444;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
add.s32 %r449, %r119, %r85;
add.s32 %r450, %r449, 4;
mul.wide.s32 %rd188, %r450, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r572,%r574,%r576,%r578};
// end inline asm
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
shl.b32 %r551, %r5, 3;
shl.b32 %r487, %r562, 5;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r489, %r213, %r488, %r551;
add.s32 %r490, %r489, %r487;
mul.wide.s32 %rd199, %r490, 4;
add.s64 %rd197, %rd42, %rd199;
// begin inline asm
st.volatile.global.v4.s32 [%rd197], {%r580,%r582,%r584,%r586};
// end inline asm
add.s32 %r491, %r490, 4;
mul.wide.s32 %rd200, %r491, 4;
add.s64 %rd198, %rd42, %rd200;
// begin inline asm
st.volatile.global.v4.s32 [%rd198], {%r588,%r590,%r592,%r594};
// end inline asm
bra.uni $L__BB0_235;
$L__BB0_227:
shl.b32 %r550, %r5, 3;
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
add.s32 %r463, %r550, 3;
sub.s32 %r120, %r463, %r213;
mov.u32 %r464, %ctaid.y;
mad.lo.s32 %r121, %r213, %r464, %r550;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
shl.b32 %r122, %r562, 5;
neg.s32 %r465, %r122;
setp.ge.s32 %p148, %r120, %r465;
@%p148 bra $L__BB0_230;
add.s32 %r470, %r121, %r122;
mul.wide.s32 %rd194, %r470, 4;
add.s64 %rd193, %rd42, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r580,%r582,%r584,%r586};
// end inline asm
$L__BB0_230:
@%p147 bra $L__BB0_235;
shl.b32 %r123, %r562, 5;
mov.u32 %r471, -4;
sub.s32 %r472, %r471, %r123;
setp.ge.s32 %p150, %r120, %r472;
@%p150 bra $L__BB0_235;
add.s32 %r477, %r121, %r123;
add.s32 %r478, %r477, 4;
mul.wide.s32 %rd196, %r478, 4;
add.s64 %rd195, %rd42, %rd196;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r588,%r590,%r592,%r594};
// end inline asm
$L__BB0_235:
mov.u32 %r124, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r492, %r5, %r9;
or.b32 %r494, %r492, %r364;
setp.ne.s32 %p152, %r494, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
mov.u32 %r495, %ctaid.x;
mov.u32 %r496, %ctaid.z;
mov.u32 %r497, %nctaid.x;
mad.lo.s32 %r498, %r496, %r497, %r495;
mul.wide.s32 %rd202, %r498, 8;
add.s64 %rd27, %rd201, %rd202;
add.s32 %r499, %r11, -1;
setp.eq.s32 %p153, %r124, %r499;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
mov.u32 %r595, 8;
$L__BB0_238:
// begin inline asm
nanosleep.u32 %r595;
// end inline asm
setp.lt.u32 %p155, %r595, 256;
selp.u32 %r502, 1, 0, %p155;
shl.b32 %r595, %r595, %r502;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_af98ba54_1033910nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
mov.u32 %r504, 1;
add.s32 %r505, %r213, 1;
shr.u32 %r506, %r505, 31;
add.s32 %r507, %r505, %r506;
shr.s32 %r508, %r507, 1;
add.s32 %r509, %r4, %r508;
add.s32 %r510, %r509, -1;
div.s32 %r511, %r510, %r4;
add.s32 %r512, %r11, -1;
add.s32 %r513, %r512, %r511;
div.s32 %r127, %r513, %r11;
add.s32 %r128, %r512, %r3;
shl.b32 %r129, %r9, 1;
shl.b32 %r514, %r4, 1;
mad.lo.s32 %r132, %r514, %r124, %r129;
or.b32 %r130, %r132, 1;
mul.lo.s32 %r131, %r514, %r11;
clz.b32 %r515, %r3;
mov.u32 %r516, 31;
sub.s32 %r517, %r516, %r515;
shl.b32 %r133, %r504, %r517;
setp.lt.u32 %p157, %r5, %r133;
add.s32 %r518, %r133, %r5;
setp.lt.u32 %p158, %r518, %r3;
and.pred %p7, %p157, %p158;
add.s32 %r519, %r50, %r133;
mul.wide.s32 %rd211, %r519, 4;
add.s64 %rd29, %rd45, %rd211;
shr.u32 %r520, %r133, 31;
add.s32 %r521, %r133, %r520;
shr.s32 %r134, %r521, 1;
add.s32 %r522, %r50, 1;
mul.wide.u32 %rd213, %r522, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
mov.u32 %r596, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
add.s32 %r596, %r596, 1;
$L__BB0_240:
.pragma "nounroll";
setp.lt.s32 %p159, %r596, %r127;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
div.s32 %r160, %r128, %r3;
setp.lt.s32 %p180, %r160, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
mul.lo.s32 %r537, %r131, %r596;
add.s32 %r161, %r130, %r537;
add.s32 %r162, %r132, %r537;
mov.u32 %r536, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r605, %r536;
$L__BB0_276:
.pragma "nounroll";
setp.ge.s32 %p181, %r161, %r213;
mov.u32 %r606, %r536;
mov.u32 %r607, %r536;
@%p181 bra $L__BB0_279;
mad.lo.s32 %r164, %r605, %r3, %r5;
setp.ge.s32 %p182, %r164, %r11;
mov.u32 %r606, %r536;
mov.u32 %r607, %r536;
@%p182 bra $L__BB0_279;
mad.lo.s32 %r544, %r164, %r213, %r162;
mul.wide.s32 %rd225, %r544, 4;
add.s64 %rd224, %rd41, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r607,%r606}, [%rd224];
// end inline asm
$L__BB0_279:
mov.b32 %f584, %r607;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r606;
add.f32 %f684, %f684, %f585;
add.s32 %r605, %r605, 1;
setp.lt.s32 %p183, %r605, %r160;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p184 bra $L__BB0_282;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
setp.lt.s32 %p185, %r133, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
mov.u32 %r608, %r134;
$L__BB0_284:
setp.ge.u32 %p186, %r5, %r608;
@%p186 bra $L__BB0_286;
add.s32 %r545, %r608, %r50;
mul.wide.s32 %rd226, %r545, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
shr.u32 %r171, %r608, 1;
setp.gt.u32 %p187, %r608, 3;
mov.u32 %r608, %r171;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
setp.lt.u32 %p189, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p189 bra $L__BB0_290;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_290:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p184 bra $L__BB0_292;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
mov.u32 %r609, %r134;
$L__BB0_294:
setp.ge.u32 %p192, %r5, %r609;
@%p192 bra $L__BB0_296;
add.s32 %r546, %r609, %r50;
mul.wide.s32 %rd229, %r546, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
shr.u32 %r173, %r609, 1;
setp.gt.u32 %p193, %r609, 3;
mov.u32 %r609, %r173;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
setp.lt.u32 %p195, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p195 bra $L__BB0_300;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_300:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p8 bra $L__BB0_303;
mul.lo.s32 %r174, %r131, %r596;
add.s32 %r547, %r130, %r174;
setp.ge.s32 %p197, %r547, %r213;
@%p197 bra $L__BB0_303;
add.s32 %r548, %r132, %r174;
mul.wide.s32 %rd232, %r548, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
setp.lt.s32 %p160, %r127, 1;
@%p160 bra $L__BB0_273;
div.s32 %r136, %r128, %r3;
mad.lo.s32 %r137, %r213, %r5, %r129;
shl.b32 %r138, %r124, 1;
shl.b32 %r139, %r11, 1;
mul.lo.s32 %r140, %r213, %r3;
mov.u32 %r597, 0;
$L__BB0_243:
.pragma "nounroll";
setp.lt.s32 %p161, %r136, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
mad.lo.s32 %r142, %r131, %r597, %r130;
mad.lo.s32 %r525, %r139, %r597, %r138;
mad.lo.s32 %r599, %r4, %r525, %r137;
mov.u32 %r524, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r598, %r5;
mov.u32 %r600, %r524;
$L__BB0_245:
.pragma "nounroll";
setp.ge.s32 %p162, %r142, %r213;
mov.u32 %r601, %r524;
mov.u32 %r602, %r524;
@%p162 bra $L__BB0_248;
setp.ge.s32 %p163, %r598, %r11;
mov.u32 %r601, %r524;
mov.u32 %r602, %r524;
@%p163 bra $L__BB0_248;
mul.wide.s32 %rd215, %r599, 4;
add.s64 %rd214, %rd42, %rd215;
// begin inline asm
ld.volatile.global.v2.s32 {%r602,%r601}, [%rd214];
// end inline asm
$L__BB0_248:
mov.b32 %f558, %r602;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r601;
add.f32 %f678, %f678, %f559;
add.s32 %r599, %r599, %r140;
add.s32 %r598, %r598, %r3;
add.s32 %r600, %r600, 1;
setp.lt.s32 %p164, %r600, %r136;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@%p184 bra $L__BB0_251;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
setp.lt.s32 %p166, %r133, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
mov.u32 %r603, %r134;
$L__BB0_253:
setp.ge.u32 %p167, %r5, %r603;
@%p167 bra $L__BB0_255;
add.s32 %r532, %r603, %r50;
mul.wide.s32 %rd216, %r532, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
shr.u32 %r155, %r603, 1;
setp.gt.u32 %p168, %r603, 3;
mov.u32 %r603, %r155;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
setp.lt.u32 %p170, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p170 bra $L__BB0_259;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_259:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p184 bra $L__BB0_261;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
mov.u32 %r604, %r134;
$L__BB0_263:
setp.ge.u32 %p173, %r5, %r604;
@%p173 bra $L__BB0_265;
add.s32 %r533, %r604, %r50;
mul.wide.s32 %rd219, %r533, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
shr.u32 %r157, %r604, 1;
setp.gt.u32 %p174, %r604, 3;
mov.u32 %r604, %r157;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
setp.lt.u32 %p176, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p176 bra $L__BB0_269;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_269:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p8 bra $L__BB0_272;
mul.lo.s32 %r158, %r131, %r597;
add.s32 %r534, %r130, %r158;
setp.ge.s32 %p178, %r534, %r213;
@%p178 bra $L__BB0_272;
add.s32 %r535, %r132, %r158;
mul.wide.s32 %rd222, %r535, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
add.s32 %r597, %r597, 1;
setp.lt.s32 %p179, %r597, %r127;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
.reg .b32 %r<604>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r221, %r222}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r225, %r226}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r247, %r212, 7;
shr.s32 %r248, %r247, 31;
shr.u32 %r249, %r248, 29;
add.s32 %r250, %r247, %r249;
shr.s32 %r2, %r250, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
mov.u32 %r251, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r251;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r252, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r253, %r4, %r2;
shl.b32 %r254, %r253, 4;
or.b32 %r255, %r254, 15;
and.b32 %r7, %r255, -16;
add.s32 %r256, %r255, %r7;
and.b32 %r257, %r256, -16;
cvt.s64.s32 %rd1, %r257;
shl.b32 %r258, %r4, 2;
max.s32 %r259, %r2, %r3;
mad.lo.s32 %r260, %r258, %r259, 15;
and.b32 %r261, %r260, -16;
cvt.u64.u32 %rd2, %r261;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r262, %r8, 7;
setp.lt.s32 %p9, %r262, %r212;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r263, smem_ptr; }
// end inline asm
shl.b32 %r266, %r5, 4;
add.s32 %r264, %r263, %r266;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r265, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r265, 0;
cp.async.ca.shared.global [%r264], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r556, %r6, 4;
add.s32 %r267, %r4, 215;
div.s32 %r268, %r267, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r269, %r11, %r268;
add.s32 %r270, %r269, -1;
div.s32 %r12, %r270, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r212;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
mov.u32 %r272, %ctaid.y;
mul.lo.s32 %r273, %r12, %r4;
mul.lo.s32 %r13, %r273, %r272;
mad.lo.s32 %r274, %r2, %r9, %r5;
shl.b32 %r14, %r274, 4;
mul.lo.s32 %r275, %r212, %r9;
cvt.s64.s32 %rd54, %r275;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r276, %r13, %r212;
cvt.s64.s32 %rd6, %r276;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r277, %tid.z;
mad.lo.s32 %r278, %r4, %r277, %r9;
mad.lo.s32 %r15, %r278, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
clz.b32 %r279, %r3;
mov.u32 %r280, 31;
sub.s32 %r281, %r280, %r279;
mov.u32 %r282, 1;
shl.b32 %r16, %r282, %r281;
setp.lt.u32 %p15, %r5, %r16;
add.s32 %r283, %r16, %r5;
setp.lt.u32 %p16, %r283, %r3;
and.pred %p2, %p15, %p16;
add.s32 %r284, %r15, %r16;
mul.wide.s32 %rd57, %r284, 4;
add.s64 %rd8, %rd45, %rd57;
shr.u32 %r285, %r16, 31;
add.s32 %r286, %r16, %r285;
shr.s32 %r17, %r286, 1;
shl.b32 %r287, %r9, 3;
mad.lo.s32 %r288, %r287, %r2, %r8;
add.s64 %rd58, %rd45, %rd51;
mul.wide.s32 %rd59, %r288, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
mul.wide.s32 %rd63, %r278, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
mov.u32 %r553, 0;
mov.f32 %f187, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
// end inline asm
add.s32 %r293, %r292, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
// end inline asm
add.s32 %r296, %r295, %r14;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
mov.f32 %f610, %f187;
mov.f32 %f611, %f187;
mov.f32 %f612, %f187;
mov.f32 %f613, %f187;
mov.f32 %f614, %f187;
mov.f32 %f615, %f187;
mov.f32 %f616, %f187;
mov.f32 %f617, %f187;
mov.f32 %f618, %f187;
mov.f32 %f619, %f187;
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r553, %r4;
add.s32 %r290, %r22, %r9;
add.s32 %r23, %r290, %r13;
setp.gt.s32 %p17, %r23, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
mul.lo.s32 %r291, %r23, %r221;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
setp.lt.s32 %p18, %r23, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
mul.lo.s32 %r551, %r553, %r4;
mul.lo.s32 %r298, %r551, %r212;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
mov.u32 %r297, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r293], [%rd68], 16, p0;
}
// end inline asm
add.s64 %rd70, %rd33, %rd74;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r297, 0;
cp.async.ca.shared.global [%r296], [%rd70], 16, p0;
}
// end inline asm
$L__BB0_11:
mul.lo.s32 %r550, %r553, %r4;
add.s32 %r549, %r550, %r9;
add.s32 %r548, %r549, %r13;
setp.gt.s32 %p199, %r548, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
mul.lo.s32 %r299, %r23, %r225;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
mul.f32 %f23, %f623, %f1;
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f189;}
// end inline asm
@%p3 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r304, %r305, %r306, %r307}, [%rd9];
ld.shared.v4.u32 {%r312, %r313, %r314, %r315}, [%rd10];
ld.shared.v4.u32 {%r320, %r321, %r322, %r323}, [%rd12];
mov.b32 {%rs36, %rs39}, %r320;
// begin inline asm
{ mov.b32 %f221, {0,%rs36};}
// end inline asm
add.f32 %f613, %f221, %f613;
mov.b32 {%rs37, %rs40}, %r312;
// begin inline asm
{ mov.b32 %f222, {0,%rs37};}
// end inline asm
mul.f32 %f245, %f221, %f222;
add.f32 %f246, %f245, 0f00000000;
mov.b32 {%rs38, %rs41}, %r304;
// begin inline asm
{ mov.b32 %f223, {0,%rs38};}
// end inline asm
sub.f32 %f247, %f223, %f622;
mul.f32 %f248, %f623, %f247;
fma.rn.f32 %f621, %f221, %f248, %f621;
fma.rn.f32 %f249, %f245, %f248, 0f00000000;
// begin inline asm
{ mov.b32 %f224, {0,%rs39};}
// end inline asm
add.f32 %f612, %f224, %f612;
// begin inline asm
{ mov.b32 %f225, {0,%rs40};}
// end inline asm
mul.f32 %f250, %f224, %f225;
add.f32 %f251, %f246, %f250;
// begin inline asm
{ mov.b32 %f226, {0,%rs41};}
// end inline asm
sub.f32 %f252, %f226, %f622;
mul.f32 %f253, %f623, %f252;
fma.rn.f32 %f620, %f224, %f253, %f620;
fma.rn.f32 %f254, %f250, %f253, %f249;
mov.b32 {%rs42, %rs45}, %r321;
// begin inline asm
{ mov.b32 %f227, {0,%rs42};}
// end inline asm
add.f32 %f611, %f227, %f611;
mov.b32 {%rs43, %rs46}, %r313;
// begin inline asm
{ mov.b32 %f228, {0,%rs43};}
// end inline asm
mul.f32 %f255, %f227, %f228;
add.f32 %f256, %f251, %f255;
mov.b32 {%rs44, %rs47}, %r305;
// begin inline asm
{ mov.b32 %f229, {0,%rs44};}
// end inline asm
sub.f32 %f257, %f229, %f622;
mul.f32 %f258, %f623, %f257;
fma.rn.f32 %f619, %f227, %f258, %f619;
fma.rn.f32 %f259, %f255, %f258, %f254;
// begin inline asm
{ mov.b32 %f230, {0,%rs45};}
// end inline asm
add.f32 %f610, %f230, %f610;
// begin inline asm
{ mov.b32 %f231, {0,%rs46};}
// end inline asm
mul.f32 %f260, %f230, %f231;
add.f32 %f261, %f256, %f260;
// begin inline asm
{ mov.b32 %f232, {0,%rs47};}
// end inline asm
sub.f32 %f262, %f232, %f622;
mul.f32 %f263, %f623, %f262;
fma.rn.f32 %f618, %f230, %f263, %f618;
fma.rn.f32 %f264, %f260, %f263, %f259;
mov.b32 {%rs48, %rs51}, %r322;
// begin inline asm
{ mov.b32 %f233, {0,%rs48};}
// end inline asm
add.f32 %f609, %f233, %f609;
mov.b32 {%rs49, %rs52}, %r314;
// begin inline asm
{ mov.b32 %f234, {0,%rs49};}
// end inline asm
mul.f32 %f265, %f233, %f234;
add.f32 %f266, %f261, %f265;
mov.b32 {%rs50, %rs53}, %r306;
// begin inline asm
{ mov.b32 %f235, {0,%rs50};}
// end inline asm
sub.f32 %f267, %f235, %f622;
mul.f32 %f268, %f623, %f267;
fma.rn.f32 %f617, %f233, %f268, %f617;
fma.rn.f32 %f269, %f265, %f268, %f264;
// begin inline asm
{ mov.b32 %f236, {0,%rs51};}
// end inline asm
add.f32 %f608, %f236, %f608;
// begin inline asm
{ mov.b32 %f237, {0,%rs52};}
// end inline asm
mul.f32 %f270, %f236, %f237;
add.f32 %f271, %f266, %f270;
// begin inline asm
{ mov.b32 %f238, {0,%rs53};}
// end inline asm
sub.f32 %f272, %f238, %f622;
mul.f32 %f273, %f623, %f272;
fma.rn.f32 %f616, %f236, %f273, %f616;
fma.rn.f32 %f274, %f270, %f273, %f269;
mov.b32 {%rs54, %rs57}, %r323;
// begin inline asm
{ mov.b32 %f239, {0,%rs54};}
// end inline asm
add.f32 %f607, %f239, %f607;
mov.b32 {%rs55, %rs58}, %r315;
// begin inline asm
{ mov.b32 %f240, {0,%rs55};}
// end inline asm
mul.f32 %f275, %f239, %f240;
add.f32 %f276, %f271, %f275;
mov.b32 {%rs56, %rs59}, %r307;
// begin inline asm
{ mov.b32 %f241, {0,%rs56};}
// end inline asm
sub.f32 %f277, %f241, %f622;
mul.f32 %f278, %f623, %f277;
fma.rn.f32 %f615, %f239, %f278, %f615;
fma.rn.f32 %f279, %f275, %f278, %f274;
// begin inline asm
{ mov.b32 %f242, {0,%rs57};}
// end inline asm
add.f32 %f606, %f242, %f606;
// begin inline asm
{ mov.b32 %f243, {0,%rs58};}
// end inline asm
mul.f32 %f280, %f242, %f243;
add.f32 %f641, %f276, %f280;
// begin inline asm
{ mov.b32 %f244, {0,%rs59};}
// end inline asm
sub.f32 %f281, %f244, %f622;
mul.f32 %f282, %f623, %f281;
fma.rn.f32 %f614, %f242, %f282, %f614;
fma.rn.f32 %f640, %f280, %f282, %f279;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f640, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f640;}
// end inline asm
mov.f32 %f641, %f640;
$L__BB0_16:
shl.b32 %r556, %r556, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
ld.shared.f32 %f284, [%rd7];
add.f32 %f285, %f283, %f284;
st.shared.f32 [%rd7], %f285;
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
mov.u32 %r554, %r17;
$L__BB0_20:
setp.ge.u32 %p24, %r5, %r554;
@%p24 bra $L__BB0_22;
add.s32 %r328, %r554, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
shr.u32 %r35, %r554, 1;
setp.gt.u32 %p25, %r554, 3;
mov.u32 %r554, %r35;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
setp.lt.u32 %p27, %r3, 2;
ld.shared.f32 %f290, [%rd7];
add.f32 %f642, %f290, 0f00000000;
@%p27 bra $L__BB0_26;
ld.shared.f32 %f291, [%rd11];
add.f32 %f642, %f642, %f291;
$L__BB0_26:
bar.sync 0;
st.shared.f32 [%rd7], %f640;
bar.sync 0;
@%p22 bra $L__BB0_28;
ld.shared.f32 %f292, [%rd8];
ld.shared.f32 %f293, [%rd7];
add.f32 %f294, %f292, %f293;
st.shared.f32 [%rd7], %f294;
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
mov.u32 %r555, %r17;
$L__BB0_30:
setp.ge.u32 %p30, %r5, %r555;
@%p30 bra $L__BB0_32;
add.s32 %r329, %r555, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
shr.u32 %r37, %r555, 1;
setp.gt.u32 %p31, %r555, 3;
mov.u32 %r555, %r37;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
setp.lt.u32 %p33, %r3, 2;
ld.shared.f32 %f299, [%rd7];
add.f32 %f643, %f299, 0f00000000;
@%p33 bra $L__BB0_36;
ld.shared.f32 %f300, [%rd11];
add.f32 %f643, %f643, %f300;
$L__BB0_36:
bar.sync 0;
@%p8 bra $L__BB0_38;
st.shared.f32 [%rd13], %f642;
$L__BB0_38:
bar.sync 0;
ld.shared.f32 %f66, [%rd13];
bar.sync 0;
@%p8 bra $L__BB0_40;
st.shared.f32 [%rd13], %f643;
$L__BB0_40:
bar.sync 0;
ld.shared.f32 %f67, [%rd13];
bar.sync 0;
mov.f32 %f301, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f301;}
// end inline asm
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
// begin inline asm
{ mov.b32 %f338, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r354;
// begin inline asm
{ mov.b32 %f339, {0,%rs98};}
// end inline asm
mul.f32 %f370, %f338, %f339;
mul.f32 %f371, %f370, %f2;
mov.b32 {%rs99, %rs103}, %r338;
// begin inline asm
{ mov.b32 %f340, {0,%rs99};}
// end inline asm
sub.f32 %f372, %f340, %f622;
mul.f32 %f373, %f623, %f372;
sub.f32 %f374, %f371, %f66;
mul.f32 %f375, %f67, %f373;
sub.f32 %f376, %f374, %f375;
mul.f32 %f341, %f23, %f376;
// begin inline asm
{ mov.b32 %f342, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f343, {0,%rs102};}
// end inline asm
mul.f32 %f377, %f342, %f343;
mul.f32 %f378, %f377, %f2;
// begin inline asm
{ mov.b32 %f344, {0,%rs103};}
// end inline asm
sub.f32 %f379, %f344, %f622;
mul.f32 %f380, %f623, %f379;
sub.f32 %f381, %f378, %f66;
mul.f32 %f382, %f67, %f380;
sub.f32 %f383, %f381, %f382;
mul.f32 %f345, %f23, %f383;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f345;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f341;}
// end inline asm
mov.b32 %r334, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r347;
// begin inline asm
{ mov.b32 %f346, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r355;
// begin inline asm
{ mov.b32 %f347, {0,%rs106};}
// end inline asm
mul.f32 %f384, %f346, %f347;
mul.f32 %f385, %f384, %f2;
mov.b32 {%rs107, %rs111}, %r339;
// begin inline asm
{ mov.b32 %f348, {0,%rs107};}
// end inline asm
sub.f32 %f386, %f348, %f622;
mul.f32 %f387, %f623, %f386;
sub.f32 %f388, %f385, %f66;
mul.f32 %f389, %f67, %f387;
sub.f32 %f390, %f388, %f389;
mul.f32 %f349, %f23, %f390;
// begin inline asm
{ mov.b32 %f350, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f351, {0,%rs110};}
// end inline asm
mul.f32 %f391, %f350, %f351;
mul.f32 %f392, %f391, %f2;
// begin inline asm
{ mov.b32 %f352, {0,%rs111};}
// end inline asm
sub.f32 %f393, %f352, %f622;
mul.f32 %f394, %f623, %f393;
sub.f32 %f395, %f392, %f66;
mul.f32 %f396, %f67, %f394;
sub.f32 %f397, %f395, %f396;
mul.f32 %f353, %f23, %f397;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f353;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f349;}
// end inline asm
mov.b32 %r335, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r348;
// begin inline asm
{ mov.b32 %f354, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r356;
// begin inline asm
{ mov.b32 %f355, {0,%rs114};}
// end inline asm
mul.f32 %f398, %f354, %f355;
mul.f32 %f399, %f398, %f2;
mov.b32 {%rs115, %rs119}, %r340;
// begin inline asm
{ mov.b32 %f356, {0,%rs115};}
// end inline asm
sub.f32 %f400, %f356, %f622;
mul.f32 %f401, %f623, %f400;
sub.f32 %f402, %f399, %f66;
mul.f32 %f403, %f67, %f401;
sub.f32 %f404, %f402, %f403;
mul.f32 %f357, %f23, %f404;
// begin inline asm
{ mov.b32 %f358, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f359, {0,%rs118};}
// end inline asm
mul.f32 %f405, %f358, %f359;
mul.f32 %f406, %f405, %f2;
// begin inline asm
{ mov.b32 %f360, {0,%rs119};}
// end inline asm
sub.f32 %f407, %f360, %f622;
mul.f32 %f408, %f623, %f407;
sub.f32 %f409, %f406, %f66;
mul.f32 %f410, %f67, %f408;
sub.f32 %f411, %f409, %f410;
mul.f32 %f361, %f23, %f411;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f361;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f357;}
// end inline asm
mov.b32 %r336, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r349;
// begin inline asm
{ mov.b32 %f362, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r357;
// begin inline asm
{ mov.b32 %f363, {0,%rs122};}
// end inline asm
mul.f32 %f412, %f362, %f363;
mul.f32 %f413, %f412, %f2;
mov.b32 {%rs123, %rs127}, %r341;
// begin inline asm
{ mov.b32 %f364, {0,%rs123};}
// end inline asm
sub.f32 %f414, %f364, %f622;
mul.f32 %f415, %f623, %f414;
sub.f32 %f416, %f413, %f66;
mul.f32 %f417, %f67, %f415;
sub.f32 %f418, %f416, %f417;
mul.f32 %f365, %f23, %f418;
// begin inline asm
{ mov.b32 %f366, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f367, {0,%rs126};}
// end inline asm
mul.f32 %f419, %f366, %f367;
mul.f32 %f420, %f419, %f2;
// begin inline asm
{ mov.b32 %f368, {0,%rs127};}
// end inline asm
sub.f32 %f421, %f368, %f622;
mul.f32 %f422, %f623, %f421;
sub.f32 %f423, %f420, %f66;
mul.f32 %f424, %f67, %f422;
sub.f32 %f425, %f423, %f424;
mul.f32 %f369, %f23, %f425;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f369;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f365;}
// end inline asm
mov.b32 %r337, {%rs124, %rs128};
mad.lo.s32 %r362, %r23, %r212, %r8;
mul.wide.s32 %rd84, %r362, 2;
add.s64 %rd83, %rd38, %rd84;
// begin inline asm
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
// end inline asm
bra.uni $L__BB0_43;
$L__BB0_41:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f301;}
// end inline asm
$L__BB0_43:
add.s32 %r553, %r553, 1;
setp.lt.s32 %p37, %r553, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
mov.f32 %f607, %f606;
mov.f32 %f608, %f606;
mov.f32 %f609, %f606;
mov.f32 %f610, %f606;
mov.f32 %f611, %f606;
mov.f32 %f612, %f606;
mov.f32 %f613, %f606;
mov.f32 %f614, %f606;
mov.f32 %f615, %f606;
mov.f32 %f616, %f606;
mov.f32 %f617, %f606;
mov.f32 %f618, %f606;
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
mov.u32 %r363, %tid.z;
mad.lo.s32 %r364, %r4, %r363, %r9;
mad.lo.s32 %r49, %r364, %r3, %r5;
mul.wide.u32 %rd85, %r49, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
clz.b32 %r365, %r4;
mov.u32 %r366, 31;
sub.s32 %r50, %r366, %r365;
mov.u32 %r367, 1;
shl.b32 %r587, %r367, %r50;
setp.lt.u32 %p38, %r9, %r587;
add.s32 %r368, %r587, %r9;
setp.lt.u32 %p39, %r368, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
shl.b32 %r369, %r3, %r50;
add.s32 %r370, %r49, %r369;
mul.wide.s32 %rd87, %r370, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
setp.lt.s32 %p41, %r587, 4;
@%p41 bra $L__BB0_51;
mov.u32 %r557, %r587;
$L__BB0_48:
shr.u32 %r53, %r557, 1;
setp.ge.u32 %p42, %r9, %r53;
@%p42 bra $L__BB0_50;
mad.lo.s32 %r371, %r53, %r3, %r49;
mul.wide.s32 %rd90, %r371, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
setp.gt.u32 %p43, %r557, 7;
mov.u32 %r557, %r53;
@%p43 bra $L__BB0_48;
$L__BB0_51:
mov.u32 %r558, 0;
add.s32 %r373, %r49, %r3;
mul.wide.u32 %rd93, %r373, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
setp.lt.u32 %p45, %r4, 2;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
mov.b32 %r558, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
shl.b32 %r374, %r3, %r50;
add.s32 %r375, %r49, %r374;
mul.wide.s32 %rd95, %r375, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
mov.u32 %r559, %r587;
$L__BB0_59:
shr.u32 %r57, %r559, 1;
setp.ge.u32 %p48, %r9, %r57;
@%p48 bra $L__BB0_61;
mad.lo.s32 %r376, %r57, %r3, %r49;
mul.wide.s32 %rd98, %r376, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
setp.gt.u32 %p49, %r559, 7;
mov.u32 %r559, %r57;
@%p49 bra $L__BB0_59;
$L__BB0_62:
mov.u32 %r560, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@%p51 bra $L__BB0_65;
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
mov.b32 %r560, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
shl.b32 %r378, %r3, %r50;
add.s32 %r379, %r49, %r378;
mul.wide.s32 %rd101, %r379, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
mov.u32 %r561, %r587;
$L__BB0_70:
shr.u32 %r61, %r561, 1;
setp.ge.u32 %p54, %r9, %r61;
@%p54 bra $L__BB0_72;
mad.lo.s32 %r380, %r61, %r3, %r49;
mul.wide.s32 %rd104, %r380, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
setp.gt.u32 %p55, %r561, 7;
mov.u32 %r561, %r61;
@%p55 bra $L__BB0_70;
$L__BB0_73:
mov.u32 %r562, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_76;
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
mov.b32 %r562, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
shl.b32 %r382, %r3, %r50;
add.s32 %r383, %r49, %r382;
mul.wide.s32 %rd107, %r383, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
mov.u32 %r563, %r587;
$L__BB0_81:
shr.u32 %r65, %r563, 1;
setp.ge.u32 %p60, %r9, %r65;
@%p60 bra $L__BB0_83;
mad.lo.s32 %r384, %r65, %r3, %r49;
mul.wide.s32 %rd110, %r384, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
setp.gt.u32 %p61, %r563, 7;
mov.u32 %r563, %r65;
@%p61 bra $L__BB0_81;
$L__BB0_84:
mov.u32 %r564, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_87;
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
mov.b32 %r564, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
shl.b32 %r386, %r3, %r50;
add.s32 %r387, %r49, %r386;
mul.wide.s32 %rd113, %r387, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
mov.u32 %r565, %r587;
$L__BB0_92:
shr.u32 %r69, %r565, 1;
setp.ge.u32 %p66, %r9, %r69;
@%p66 bra $L__BB0_94;
mad.lo.s32 %r388, %r69, %r3, %r49;
mul.wide.s32 %rd116, %r388, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
setp.gt.u32 %p67, %r565, 7;
mov.u32 %r565, %r69;
@%p67 bra $L__BB0_92;
$L__BB0_95:
mov.u32 %r566, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_98;
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
mov.b32 %r566, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
shl.b32 %r390, %r3, %r50;
add.s32 %r391, %r49, %r390;
mul.wide.s32 %rd119, %r391, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
mov.u32 %r567, %r587;
$L__BB0_103:
shr.u32 %r73, %r567, 1;
setp.ge.u32 %p72, %r9, %r73;
@%p72 bra $L__BB0_105;
mad.lo.s32 %r392, %r73, %r3, %r49;
mul.wide.s32 %rd122, %r392, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
setp.gt.u32 %p73, %r567, 7;
mov.u32 %r567, %r73;
@%p73 bra $L__BB0_103;
$L__BB0_106:
mov.u32 %r568, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_109;
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
mov.b32 %r568, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
shl.b32 %r394, %r3, %r50;
add.s32 %r395, %r49, %r394;
mul.wide.s32 %rd125, %r395, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
mov.u32 %r569, %r587;
$L__BB0_114:
shr.u32 %r77, %r569, 1;
setp.ge.u32 %p78, %r9, %r77;
@%p78 bra $L__BB0_116;
mad.lo.s32 %r396, %r77, %r3, %r49;
mul.wide.s32 %rd128, %r396, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
setp.gt.u32 %p79, %r569, 7;
mov.u32 %r569, %r77;
@%p79 bra $L__BB0_114;
$L__BB0_117:
mov.u32 %r570, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_120;
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
mov.b32 %r570, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
shl.b32 %r398, %r3, %r50;
add.s32 %r399, %r49, %r398;
mul.wide.s32 %rd131, %r399, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
mov.u32 %r571, %r587;
$L__BB0_125:
shr.u32 %r81, %r571, 1;
setp.ge.u32 %p84, %r9, %r81;
@%p84 bra $L__BB0_127;
mad.lo.s32 %r400, %r81, %r3, %r49;
mul.wide.s32 %rd134, %r400, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
setp.gt.u32 %p85, %r571, 7;
mov.u32 %r571, %r81;
@%p85 bra $L__BB0_125;
$L__BB0_128:
mov.u32 %r572, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_131;
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
mov.b32 %r572, %f667;
$L__BB0_132:
bar.sync 0;
shl.b32 %r84, %r556, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
shl.b32 %r402, %r3, %r50;
add.s32 %r403, %r49, %r402;
mul.wide.s32 %rd137, %r403, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
mov.u32 %r573, %r587;
$L__BB0_136:
shr.u32 %r86, %r573, 1;
setp.ge.u32 %p90, %r9, %r86;
@%p90 bra $L__BB0_138;
mad.lo.s32 %r404, %r86, %r3, %r49;
mul.wide.s32 %rd140, %r404, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
setp.gt.u32 %p91, %r573, 7;
mov.u32 %r573, %r86;
@%p91 bra $L__BB0_136;
$L__BB0_139:
mov.u32 %r574, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_142;
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
mov.b32 %r574, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
shl.b32 %r406, %r3, %r50;
add.s32 %r407, %r49, %r406;
mul.wide.s32 %rd143, %r407, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
mov.u32 %r575, %r587;
$L__BB0_147:
shr.u32 %r90, %r575, 1;
setp.ge.u32 %p96, %r9, %r90;
@%p96 bra $L__BB0_149;
mad.lo.s32 %r408, %r90, %r3, %r49;
mul.wide.s32 %rd146, %r408, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
setp.gt.u32 %p97, %r575, 7;
mov.u32 %r575, %r90;
@%p97 bra $L__BB0_147;
$L__BB0_150:
mov.u32 %r576, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_153;
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
mov.b32 %r576, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
shl.b32 %r410, %r3, %r50;
add.s32 %r411, %r49, %r410;
mul.wide.s32 %rd149, %r411, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
mov.u32 %r577, %r587;
$L__BB0_158:
shr.u32 %r94, %r577, 1;
setp.ge.u32 %p102, %r9, %r94;
@%p102 bra $L__BB0_160;
mad.lo.s32 %r412, %r94, %r3, %r49;
mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
setp.gt.u32 %p103, %r577, 7;
mov.u32 %r577, %r94;
@%p103 bra $L__BB0_158;
$L__BB0_161:
mov.u32 %r578, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_164;
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
mov.b32 %r578, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
shl.b32 %r414, %r3, %r50;
add.s32 %r415, %r49, %r414;
mul.wide.s32 %rd155, %r415, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
mov.u32 %r579, %r587;
$L__BB0_169:
shr.u32 %r98, %r579, 1;
setp.ge.u32 %p108, %r9, %r98;
@%p108 bra $L__BB0_171;
mad.lo.s32 %r416, %r98, %r3, %r49;
mul.wide.s32 %rd158, %r416, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
setp.gt.u32 %p109, %r579, 7;
mov.u32 %r579, %r98;
@%p109 bra $L__BB0_169;
$L__BB0_172:
mov.u32 %r580, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_175;
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
mov.b32 %r580, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
shl.b32 %r418, %r3, %r50;
add.s32 %r419, %r49, %r418;
mul.wide.s32 %rd161, %r419, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
mov.u32 %r581, %r587;
$L__BB0_180:
shr.u32 %r102, %r581, 1;
setp.ge.u32 %p114, %r9, %r102;
@%p114 bra $L__BB0_182;
mad.lo.s32 %r420, %r102, %r3, %r49;
mul.wide.s32 %rd164, %r420, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
setp.gt.u32 %p115, %r581, 7;
mov.u32 %r581, %r102;
@%p115 bra $L__BB0_180;
$L__BB0_183:
mov.u32 %r582, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_186;
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
mov.b32 %r582, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
shl.b32 %r422, %r3, %r50;
add.s32 %r423, %r49, %r422;
mul.wide.s32 %rd167, %r423, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
mov.u32 %r583, %r587;
$L__BB0_191:
shr.u32 %r106, %r583, 1;
setp.ge.u32 %p120, %r9, %r106;
@%p120 bra $L__BB0_193;
mad.lo.s32 %r424, %r106, %r3, %r49;
mul.wide.s32 %rd170, %r424, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
setp.gt.u32 %p121, %r583, 7;
mov.u32 %r583, %r106;
@%p121 bra $L__BB0_191;
$L__BB0_194:
mov.u32 %r584, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_197;
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
mov.b32 %r584, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
shl.b32 %r426, %r3, %r50;
add.s32 %r427, %r49, %r426;
mul.wide.s32 %rd173, %r427, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
mov.u32 %r585, %r587;
$L__BB0_202:
shr.u32 %r110, %r585, 1;
setp.ge.u32 %p126, %r9, %r110;
@%p126 bra $L__BB0_204;
mad.lo.s32 %r428, %r110, %r3, %r49;
mul.wide.s32 %rd176, %r428, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
setp.gt.u32 %p127, %r585, 7;
mov.u32 %r585, %r110;
@%p127 bra $L__BB0_202;
$L__BB0_205:
mov.u32 %r586, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
mov.b32 %r586, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
shl.b32 %r430, %r3, %r50;
add.s32 %r431, %r49, %r430;
mul.wide.s32 %rd179, %r431, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
shr.u32 %r114, %r587, 1;
setp.ge.u32 %p132, %r9, %r114;
@%p132 bra $L__BB0_214;
mad.lo.s32 %r432, %r114, %r3, %r49;
mul.wide.s32 %rd182, %r432, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
setp.gt.u32 %p133, %r587, 7;
mov.u32 %r587, %r114;
@%p133 bra $L__BB0_212;
$L__BB0_215:
mov.u32 %r588, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_218;
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
mov.b32 %r588, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
mov.u32 %r458, %ctaid.y;
mad.lo.s32 %r459, %r212, %r458, %r8;
add.s32 %r460, %r459, %r84;
mul.wide.s32 %rd191, %r460, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r558,%r560,%r562,%r564};
// end inline asm
add.s32 %r461, %r460, 4;
mul.wide.s32 %rd192, %r461, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r566,%r568,%r570,%r572};
// end inline asm
bra.uni $L__BB0_226;
$L__BB0_220:
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
add.s32 %r434, %r8, 3;
sub.s32 %r117, %r434, %r212;
mov.u32 %r435, %ctaid.y;
mad.lo.s32 %r118, %r212, %r435, %r8;
neg.s32 %r436, %r84;
setp.ge.s32 %p139, %r117, %r436;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
add.s32 %r441, %r118, %r84;
mul.wide.s32 %rd186, %r441, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r558,%r560,%r562,%r564};
// end inline asm
$L__BB0_222:
mov.u32 %r442, -4;
sub.s32 %r443, %r442, %r84;
setp.ge.s32 %p141, %r117, %r443;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
add.s32 %r448, %r118, %r84;
add.s32 %r449, %r448, 4;
mul.wide.s32 %rd188, %r449, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r566,%r568,%r570,%r572};
// end inline asm
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
shl.b32 %r486, %r556, 5;
mov.u32 %r487, %ctaid.y;
mad.lo.s32 %r488, %r212, %r487, %r8;
add.s32 %r489, %r488, %r486;
mul.wide.s32 %rd199, %r489, 4;
add.s64 %rd197, %rd42, %rd199;
// begin inline asm
st.volatile.global.v4.s32 [%rd197], {%r574,%r576,%r578,%r580};
// end inline asm
add.s32 %r490, %r489, 4;
mul.wide.s32 %rd200, %r490, 4;
add.s64 %rd198, %rd42, %rd200;
// begin inline asm
st.volatile.global.v4.s32 [%rd198], {%r582,%r584,%r586,%r588};
// end inline asm
bra.uni $L__BB0_235;
$L__BB0_227:
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
add.s32 %r462, %r8, 3;
sub.s32 %r119, %r462, %r212;
mov.u32 %r463, %ctaid.y;
mad.lo.s32 %r120, %r212, %r463, %r8;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
shl.b32 %r121, %r556, 5;
neg.s32 %r464, %r121;
setp.ge.s32 %p148, %r119, %r464;
@%p148 bra $L__BB0_230;
add.s32 %r469, %r120, %r121;
mul.wide.s32 %rd194, %r469, 4;
add.s64 %rd193, %rd42, %rd194;
// begin inline asm
st.volatile.global.v4.s32 [%rd193], {%r574,%r576,%r578,%r580};
// end inline asm
$L__BB0_230:
@%p147 bra $L__BB0_235;
shl.b32 %r122, %r556, 5;
mov.u32 %r470, -4;
sub.s32 %r471, %r470, %r122;
setp.ge.s32 %p150, %r119, %r471;
@%p150 bra $L__BB0_235;
add.s32 %r476, %r120, %r122;
add.s32 %r477, %r476, 4;
mul.wide.s32 %rd196, %r477, 4;
add.s64 %rd195, %rd42, %rd196;
// begin inline asm
st.volatile.global.v4.s32 [%rd195], {%r582,%r584,%r586,%r588};
// end inline asm
$L__BB0_235:
mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r491, %r5, %r9;
or.b32 %r493, %r491, %r363;
setp.ne.s32 %p152, %r493, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
mov.u32 %r494, %ctaid.x;
mov.u32 %r495, %ctaid.z;
mov.u32 %r496, %nctaid.x;
mad.lo.s32 %r497, %r495, %r496, %r494;
mul.wide.s32 %rd202, %r497, 8;
add.s64 %rd27, %rd201, %rd202;
add.s32 %r498, %r11, -1;
setp.eq.s32 %p153, %r123, %r498;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
mov.u32 %r589, 8;
$L__BB0_238:
// begin inline asm
nanosleep.u32 %r589;
// end inline asm
setp.lt.u32 %p155, %r589, 256;
selp.u32 %r501, 1, 0, %p155;
shl.b32 %r589, %r589, %r501;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_43_cu_1d87bf9c_723310nvfuser_43ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
mov.u32 %r503, 1;
add.s32 %r504, %r212, 1;
shr.u32 %r505, %r504, 31;
add.s32 %r506, %r504, %r505;
shr.s32 %r507, %r506, 1;
add.s32 %r508, %r4, %r507;
add.s32 %r509, %r508, -1;
div.s32 %r510, %r509, %r4;
add.s32 %r511, %r11, -1;
add.s32 %r512, %r511, %r510;
div.s32 %r126, %r512, %r11;
add.s32 %r127, %r511, %r3;
shl.b32 %r128, %r9, 1;
shl.b32 %r513, %r4, 1;
mad.lo.s32 %r131, %r513, %r123, %r128;
or.b32 %r129, %r131, 1;
mul.lo.s32 %r130, %r513, %r11;
clz.b32 %r514, %r3;
mov.u32 %r515, 31;
sub.s32 %r516, %r515, %r514;
shl.b32 %r132, %r503, %r516;
setp.lt.u32 %p157, %r5, %r132;
add.s32 %r517, %r132, %r5;
setp.lt.u32 %p158, %r517, %r3;
and.pred %p7, %p157, %p158;
add.s32 %r518, %r49, %r132;
mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd45, %rd211;
shr.u32 %r519, %r132, 31;
add.s32 %r520, %r132, %r519;
shr.s32 %r133, %r520, 1;
add.s32 %r521, %r49, 1;
mul.wide.u32 %rd213, %r521, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
mov.u32 %r590, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
add.s32 %r590, %r590, 1;
$L__BB0_240:
.pragma "nounroll";
setp.lt.s32 %p159, %r590, %r126;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
div.s32 %r159, %r127, %r3;
setp.lt.s32 %p180, %r159, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
mul.lo.s32 %r536, %r130, %r590;
add.s32 %r160, %r129, %r536;
add.s32 %r161, %r131, %r536;
mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
mov.u32 %r599, %r535;
$L__BB0_276:
.pragma "nounroll";
setp.ge.s32 %p181, %r160, %r212;
mov.u32 %r600, %r535;
mov.u32 %r601, %r535;
@%p181 bra $L__BB0_279;
mad.lo.s32 %r163, %r599, %r3, %r5;
setp.ge.s32 %p182, %r163, %r11;
mov.u32 %r600, %r535;
mov.u32 %r601, %r535;
@%p182 bra $L__BB0_279;
mad.lo.s32 %r543, %r163, %r212, %r161;
mul.wide.s32 %rd225, %r543, 4;
add.s64 %rd224, %rd41, %rd225;
// begin inline asm
ld.volatile.global.v2.s32 {%r601,%r600}, [%rd224];
// end inline asm
$L__BB0_279:
mov.b32 %f584, %r601;
add.f32 %f685, %f685, %f584;
mov.b32 %f585, %r600;
add.f32 %f684, %f684, %f585;
add.s32 %r599, %r599, 1;
setp.lt.s32 %p183, %r599, %r159;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@%p184 bra $L__BB0_282;
ld.shared.f32 %f586, [%rd29];
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
setp.lt.s32 %p185, %r132, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
mov.u32 %r602, %r133;
$L__BB0_284:
setp.ge.u32 %p186, %r5, %r602;
@%p186 bra $L__BB0_286;
add.s32 %r544, %r602, %r49;
mul.wide.s32 %rd226, %r544, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
shr.u32 %r170, %r602, 1;
setp.gt.u32 %p187, %r602, 3;
mov.u32 %r602, %r170;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
setp.lt.u32 %p189, %r3, 2;
ld.shared.f32 %f593, [%rd23];
add.f32 %f686, %f593, 0f00000000;
@%p189 bra $L__BB0_290;
ld.shared.f32 %f594, [%rd30];
add.f32 %f686, %f686, %f594;
$L__BB0_290:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f686;}
// end inline asm
st.shared.f32 [%rd23], %f684;
bar.sync 0;
@%p184 bra $L__BB0_292;
ld.shared.f32 %f596, [%rd29];
ld.shared.f32 %f597, [%rd23];
add.f32 %f598, %f596, %f597;
st.shared.f32 [%rd23], %f598;
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
mov.u32 %r603, %r133;
$L__BB0_294:
setp.ge.u32 %p192, %r5, %r603;
@%p192 bra $L__BB0_296;
add.s32 %r545, %r603, %r49;
mul.wide.s32 %rd229, %r545, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
shr.u32 %r172, %r603, 1;
setp.gt.u32 %p193, %r603, 3;
mov.u32 %r603, %r172;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
setp.lt.u32 %p195, %r3, 2;
ld.shared.f32 %f603, [%rd23];
add.f32 %f687, %f603, 0f00000000;
@%p195 bra $L__BB0_300;
ld.shared.f32 %f604, [%rd30];
add.f32 %f687, %f687, %f604;
$L__BB0_300:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f687;}
// end inline asm
@%p8 bra $L__BB0_303;
mul.lo.s32 %r173, %r130, %r590;
add.s32 %r546, %r129, %r173;
setp.ge.s32 %p197, %r546, %r212;
@%p197 bra $L__BB0_303;
add.s32 %r547, %r131, %r173;
mul.wide.s32 %rd232, %r547, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
setp.lt.s32 %p160, %r126, 1;
@%p160 bra $L__BB0_273;
div.s32 %r135, %r127, %r3;
mad.lo.s32 %r136, %r212, %r5, %r128;
shl.b32 %r137, %r123, 1;
shl.b32 %r138, %r11, 1;
mul.lo.s32 %r139, %r212, %r3;
mov.u32 %r591, 0;
$L__BB0_243:
.pragma "nounroll";
setp.lt.s32 %p161, %r135, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
mad.lo.s32 %r141, %r130, %r591, %r129;
mad.lo.s32 %r524, %r138, %r591, %r137;
mad.lo.s32 %r593, %r4, %r524, %r136;
mov.u32 %r523, 0;
mov.f32 %f678, 0f00000000;
mov.u32 %r592, %r5;
mov.u32 %r594, %r523;
$L__BB0_245:
.pragma "nounroll";
setp.ge.s32 %p162, %r141, %r212;
mov.u32 %r595, %r523;
mov.u32 %r596, %r523;
@%p162 bra $L__BB0_248;
setp.ge.s32 %p163, %r592, %r11;
mov.u32 %r595, %r523;
mov.u32 %r596, %r523;
@%p163 bra $L__BB0_248;
mul.wide.s32 %rd215, %r593, 4;
add.s64 %rd214, %rd42, %rd215;
// begin inline asm
ld.volatile.global.v2.s32 {%r596,%r595}, [%rd214];
// end inline asm
$L__BB0_248:
mov.b32 %f558, %r596;
add.f32 %f679, %f679, %f558;
mov.b32 %f559, %r595;
add.f32 %f678, %f678, %f559;
add.s32 %r593, %r593, %r139;
add.s32 %r592, %r592, %r3;
add.s32 %r594, %r594, 1;
setp.lt.s32 %p164, %r594, %r135;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@%p184 bra $L__BB0_251;
ld.shared.f32 %f560, [%rd29];
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
setp.lt.s32 %p166, %r132, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
mov.u32 %r597, %r133;
$L__BB0_253:
setp.ge.u32 %p167, %r5, %r597;
@%p167 bra $L__BB0_255;
add.s32 %r531, %r597, %r49;
mul.wide.s32 %rd216, %r531, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
shr.u32 %r154, %r597, 1;
setp.gt.u32 %p168, %r597, 3;
mov.u32 %r597, %r154;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
setp.lt.u32 %p170, %r3, 2;
ld.shared.f32 %f567, [%rd23];
add.f32 %f680, %f567, 0f00000000;
@%p170 bra $L__BB0_259;
ld.shared.f32 %f568, [%rd30];
add.f32 %f680, %f680, %f568;
$L__BB0_259:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f680;}
// end inline asm
st.shared.f32 [%rd23], %f678;
bar.sync 0;
@%p184 bra $L__BB0_261;
ld.shared.f32 %f570, [%rd29];
ld.shared.f32 %f571, [%rd23];
add.f32 %f572, %f570, %f571;
st.shared.f32 [%rd23], %f572;
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
mov.u32 %r598, %r133;
$L__BB0_263:
setp.ge.u32 %p173, %r5, %r598;
@%p173 bra $L__BB0_265;
add.s32 %r532, %r598, %r49;
mul.wide.s32 %rd219, %r532, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
shr.u32 %r156, %r598, 1;
setp.gt.u32 %p174, %r598, 3;
mov.u32 %r598, %r156;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
setp.lt.u32 %p176, %r3, 2;
ld.shared.f32 %f577, [%rd23];
add.f32 %f681, %f577, 0f00000000;
@%p176 bra $L__BB0_269;
ld.shared.f32 %f578, [%rd30];
add.f32 %f681, %f681, %f578;
$L__BB0_269:
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f681;}
// end inline asm
@%p8 bra $L__BB0_272;
mul.lo.s32 %r157, %r130, %r591;
add.s32 %r533, %r129, %r157;
setp.ge.s32 %p178, %r533, %r212;
@%p178 bra $L__BB0_272;
add.s32 %r534, %r131, %r157;
mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
add.s32 %r591, %r591, 1;
setp.lt.s32 %p179, %r591, %r126;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,173 +32,173 @@
)
{
.reg .pred %p<200>;
.reg .b16 %rs<133>;
.reg .f32 %f<688>;
- .reg .b32 %r<610>;
+ .reg .b32 %r<604>;
.reg .f64 %fd<3>;
.reg .b64 %rd<237>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r222, %r223}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r226, %r227}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r221, %r222}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r225, %r226}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd42, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd38, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r248, %r213, 7;
- shr.s32 %r249, %r248, 31;
- shr.u32 %r250, %r249, 29;
- add.s32 %r251, %r248, %r250;
- shr.s32 %r2, %r251, 3;
+ add.s32 %r247, %r212, 7;
+ shr.s32 %r248, %r247, 31;
+ shr.u32 %r249, %r248, 29;
+ add.s32 %r250, %r247, %r249;
+ shr.s32 %r2, %r250, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p8, %r5, 0;
@%p8 bra $L__BB0_2;
- mov.u32 %r252, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r252;
+ mov.u32 %r251, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r251;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd44, _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r253, [%rd44], %r5;
+ atom.shared.min.s32 %r252, [%rd44], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r254, %r4, %r2;
- shl.b32 %r255, %r254, 4;
- or.b32 %r256, %r255, 15;
- and.b32 %r7, %r256, -16;
- add.s32 %r257, %r256, %r7;
- and.b32 %r258, %r257, -16;
- cvt.s64.s32 %rd1, %r258;
- shl.b32 %r259, %r4, 2;
- max.s32 %r260, %r2, %r3;
- mad.lo.s32 %r261, %r259, %r260, 15;
- and.b32 %r262, %r261, -16;
- cvt.u64.u32 %rd2, %r262;
+ mul.lo.s32 %r253, %r4, %r2;
+ shl.b32 %r254, %r253, 4;
+ or.b32 %r255, %r254, 15;
+ and.b32 %r7, %r255, -16;
+ add.s32 %r256, %r255, %r7;
+ and.b32 %r257, %r256, -16;
+ cvt.s64.s32 %rd1, %r257;
+ shl.b32 %r258, %r4, 2;
+ max.s32 %r259, %r2, %r3;
+ mad.lo.s32 %r260, %r258, %r259, 15;
+ and.b32 %r261, %r260, -16;
+ cvt.u64.u32 %rd2, %r261;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r263, %r8, 7;
- setp.lt.s32 %p9, %r263, %r213;
+ or.b32 %r262, %r8, 7;
+ setp.lt.s32 %p9, %r262, %r212;
setp.lt.s32 %p10, %r5, %r2;
and.pred %p1, %p9, %p10;
not.pred %p11, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p12, %r9, 0;
or.pred %p13, %p12, %p11;
@%p13 bra $L__BB0_4;
add.s64 %rd47, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r264, smem_ptr; }
-
-
- shl.b32 %r267, %r5, 4;
- add.s32 %r265, %r264, %r267;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r263, smem_ptr; }
+
+
+ shl.b32 %r266, %r5, 4;
+ add.s32 %r264, %r263, %r266;
mul.wide.s32 %rd49, %r8, 2;
add.s64 %rd48, %rd37, %rd49;
- mov.u32 %r266, 0;
+ mov.u32 %r265, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r266, 0;
- cp.async.ca.shared.global [%r265], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r265, 0;
+ cp.async.ca.shared.global [%r264], [%rd48], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r562, %r6, 4;
- add.s32 %r268, %r4, 215;
- div.s32 %r269, %r268, %r4;
+ shl.b32 %r556, %r6, 4;
+ add.s32 %r267, %r4, 215;
+ div.s32 %r268, %r267, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r270, %r11, %r269;
- add.s32 %r271, %r270, -1;
- div.s32 %r12, %r271, %r11;
+ add.s32 %r269, %r11, %r268;
+ add.s32 %r270, %r269, -1;
+ div.s32 %r12, %r270, %r11;
setp.gt.s32 %p14, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p14 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r213;
+ cvt.rn.f64.s32 %fd1, %r212;
cvt.s64.s32 %rd50, %r7;
add.s64 %rd51, %rd50, %rd2;
add.s64 %rd53, %rd45, %rd2;
- mov.u32 %r273, %ctaid.y;
- mul.lo.s32 %r274, %r12, %r4;
- mul.lo.s32 %r13, %r274, %r273;
- shl.b32 %r275, %r9, 1;
- mov.u32 %r276, 1;
- shl.b32 %r277, %r5, 4;
- mad.lo.s32 %r14, %r275, %r213, %r277;
- mul.lo.s32 %r278, %r213, %r9;
- cvt.s64.s32 %rd54, %r278;
+ mov.u32 %r272, %ctaid.y;
+ mul.lo.s32 %r273, %r12, %r4;
+ mul.lo.s32 %r13, %r273, %r272;
+ mad.lo.s32 %r274, %r2, %r9, %r5;
+ shl.b32 %r14, %r274, 4;
+ mul.lo.s32 %r275, %r212, %r9;
+ cvt.s64.s32 %rd54, %r275;
cvt.s64.s32 %rd55, %r8;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r279, %r13, %r213;
- cvt.s64.s32 %rd6, %r279;
+ mul.lo.s32 %r276, %r13, %r212;
+ cvt.s64.s32 %rd6, %r276;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- mov.u32 %r280, %tid.z;
- mad.lo.s32 %r281, %r4, %r280, %r9;
- mad.lo.s32 %r15, %r281, %r3, %r5;
+ mov.u32 %r277, %tid.z;
+ mad.lo.s32 %r278, %r4, %r277, %r9;
+ mad.lo.s32 %r15, %r278, %r3, %r5;
mul.wide.u32 %rd56, %r15, 4;
add.s64 %rd7, %rd45, %rd56;
- clz.b32 %r282, %r3;
- mov.u32 %r283, 31;
- sub.s32 %r284, %r283, %r282;
- shl.b32 %r16, %r276, %r284;
+ clz.b32 %r279, %r3;
+ mov.u32 %r280, 31;
+ sub.s32 %r281, %r280, %r279;
+ mov.u32 %r282, 1;
+ shl.b32 %r16, %r282, %r281;
setp.lt.u32 %p15, %r5, %r16;
- add.s32 %r285, %r16, %r5;
- setp.lt.u32 %p16, %r285, %r3;
+ add.s32 %r283, %r16, %r5;
+ setp.lt.u32 %p16, %r283, %r3;
and.pred %p2, %p15, %p16;
- add.s32 %r286, %r15, %r16;
- mul.wide.s32 %rd57, %r286, 4;
+ add.s32 %r284, %r15, %r16;
+ mul.wide.s32 %rd57, %r284, 4;
add.s64 %rd8, %rd45, %rd57;
- shr.u32 %r287, %r16, 31;
- add.s32 %r288, %r16, %r287;
- shr.s32 %r17, %r288, 1;
- add.s32 %r18, %r278, %r8;
+ shr.u32 %r285, %r16, 31;
+ add.s32 %r286, %r16, %r285;
+ shr.s32 %r17, %r286, 1;
+ shl.b32 %r287, %r9, 3;
+ mad.lo.s32 %r288, %r287, %r2, %r8;
add.s64 %rd58, %rd45, %rd51;
- mul.wide.s32 %rd59, %r18, 2;
+ mul.wide.s32 %rd59, %r288, 2;
add.s64 %rd9, %rd58, %rd59;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r8, 2;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r289, %r15, 1;
mul.wide.u32 %rd62, %r289, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd59;
- mul.wide.s32 %rd63, %r281, 4;
+ mul.wide.s32 %rd63, %r278, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd14, %rd46, %rd51;
cvta.to.global.u64 %rd16, %rd36;
cvta.to.global.u64 %rd17, %rd35;
- mov.u32 %r559, 0;
+ mov.u32 %r553, 0;
mov.f32 %f187, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd14; cvt.u32.u64 %r292, smem_ptr; }
- add.s32 %r293, %r14, %r292;
+ add.s32 %r293, %r292, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r295, smem_ptr; }
- add.s32 %r296, %r14, %r295;
+ add.s32 %r296, %r295, %r14;
not.pred %p22, %p2;
mov.f32 %f606, %f187;
mov.f32 %f607, %f187;
mov.f32 %f608, %f187;
mov.f32 %f609, %f187;
@@ -215,30 +215,30 @@
mov.f32 %f620, %f187;
mov.f32 %f621, %f187;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r559, %r4;
- add.s32 %r290, %r23, %r9;
- add.s32 %r24, %r290, %r13;
- setp.gt.s32 %p17, %r24, 215;
+ mul.lo.s32 %r22, %r553, %r4;
+ add.s32 %r290, %r22, %r9;
+ add.s32 %r23, %r290, %r13;
+ setp.gt.s32 %p17, %r23, 215;
mov.f32 %f622, %f187;
@%p17 bra $L__BB0_9;
- mul.lo.s32 %r291, %r24, %r222;
+ mul.lo.s32 %r291, %r23, %r221;
mul.wide.s32 %rd65, %r291, 4;
add.s64 %rd66, %rd17, %rd65;
ld.global.f32 %f622, [%rd66];
$L__BB0_9:
- setp.lt.s32 %p18, %r24, 216;
+ setp.lt.s32 %p18, %r23, 216;
and.pred %p3, %p1, %p18;
not.pred %p19, %p3;
@%p19 bra $L__BB0_11;
- mul.lo.s32 %r557, %r559, %r4;
- mul.lo.s32 %r298, %r557, %r213;
+ mul.lo.s32 %r551, %r553, %r4;
+ mul.lo.s32 %r298, %r551, %r212;
cvt.s64.s32 %rd71, %r298;
add.s64 %rd72, %rd5, %rd71;
add.s64 %rd73, %rd72, %rd6;
shl.b64 %rd74, %rd73, 1;
add.s64 %rd68, %rd34, %rd74;
@@ -260,19 +260,19 @@
}
$L__BB0_11:
- mul.lo.s32 %r555, %r559, %r4;
- add.s32 %r554, %r555, %r9;
- add.s32 %r553, %r554, %r13;
- setp.gt.s32 %p199, %r553, 215;
+ mul.lo.s32 %r550, %r553, %r4;
+ add.s32 %r549, %r550, %r9;
+ add.s32 %r548, %r549, %r13;
+ setp.gt.s32 %p199, %r548, 215;
mov.f32 %f189, 0f00000000;
mov.f32 %f623, %f189;
@%p199 bra $L__BB0_13;
- mul.lo.s32 %r299, %r24, %r226;
+ mul.lo.s32 %r299, %r23, %r225;
mul.wide.s32 %rd75, %r299, 4;
add.s64 %rd76, %rd16, %rd75;
ld.global.f32 %f623, [%rd76];
$L__BB0_13:
@@ -465,11 +465,11 @@
mov.f32 %f641, %f640;
$L__BB0_16:
- shl.b32 %r562, %r562, 2;
+ shl.b32 %r556, %r556, 2;
st.shared.f32 [%rd7], %f641;
bar.sync 0;
@%p22 bra $L__BB0_18;
ld.shared.f32 %f283, [%rd8];
@@ -480,29 +480,29 @@
$L__BB0_18:
setp.lt.s32 %p23, %r16, 4;
bar.sync 0;
@%p23 bra $L__BB0_23;
- mov.u32 %r560, %r17;
+ mov.u32 %r554, %r17;
$L__BB0_20:
- setp.ge.u32 %p24, %r5, %r560;
+ setp.ge.u32 %p24, %r5, %r554;
@%p24 bra $L__BB0_22;
- add.s32 %r328, %r560, %r15;
+ add.s32 %r328, %r554, %r15;
mul.wide.s32 %rd77, %r328, 4;
add.s64 %rd79, %rd45, %rd77;
ld.shared.f32 %f286, [%rd7];
ld.shared.f32 %f287, [%rd79];
add.f32 %f288, %f287, %f286;
st.shared.f32 [%rd7], %f288;
$L__BB0_22:
bar.sync 0;
- shr.u32 %r36, %r560, 1;
- setp.gt.u32 %p25, %r560, 3;
- mov.u32 %r560, %r36;
+ shr.u32 %r35, %r554, 1;
+ setp.gt.u32 %p25, %r554, 3;
+ mov.u32 %r554, %r35;
@%p25 bra $L__BB0_20;
$L__BB0_23:
mov.f32 %f642, 0f00000000;
@%p8 bra $L__BB0_26;
@@ -529,29 +529,29 @@
$L__BB0_28:
setp.lt.s32 %p198, %r16, 4;
bar.sync 0;
@%p198 bra $L__BB0_33;
- mov.u32 %r561, %r17;
+ mov.u32 %r555, %r17;
$L__BB0_30:
- setp.ge.u32 %p30, %r5, %r561;
+ setp.ge.u32 %p30, %r5, %r555;
@%p30 bra $L__BB0_32;
- add.s32 %r329, %r561, %r15;
+ add.s32 %r329, %r555, %r15;
mul.wide.s32 %rd80, %r329, 4;
add.s64 %rd82, %rd45, %rd80;
ld.shared.f32 %f295, [%rd7];
ld.shared.f32 %f296, [%rd82];
add.f32 %f297, %f296, %f295;
st.shared.f32 [%rd7], %f297;
$L__BB0_32:
bar.sync 0;
- shr.u32 %r38, %r561, 1;
- setp.gt.u32 %p31, %r561, 3;
- mov.u32 %r561, %r38;
+ shr.u32 %r37, %r555, 1;
+ setp.gt.u32 %p31, %r555, 3;
+ mov.u32 %r555, %r37;
@%p31 bra $L__BB0_30;
$L__BB0_33:
mov.f32 %f643, 0f00000000;
@%p8 bra $L__BB0_36;
@@ -589,11 +589,10 @@
@%p3 bra $L__BB0_42;
bra.uni $L__BB0_41;
$L__BB0_42:
- mul.lo.s32 %r556, %r559, %r4;
ld.shared.v4.u32 {%r338, %r339, %r340, %r341}, [%rd9];
ld.shared.v4.u32 {%r346, %r347, %r348, %r349}, [%rd10];
ld.shared.v4.u32 {%r354, %r355, %r356, %r357}, [%rd12];
mov.b32 {%rs97, %rs101}, %r346;
@@ -801,13 +800,12 @@
{ cvt.rn.bf16.f32 %rs124, %f365;}
mov.b32 %r337, {%rs124, %rs128};
- add.s32 %r362, %r13, %r556;
- mad.lo.s32 %r363, %r362, %r213, %r18;
- mul.wide.s32 %rd84, %r363, 2;
+ mad.lo.s32 %r362, %r23, %r212, %r8;
+ mul.wide.s32 %rd84, %r362, 2;
add.s64 %rd83, %rd38, %rd84;
st.global.cs.v4.s32 [%rd83], {%r334,%r335,%r336,%r337};
bra.uni $L__BB0_43;
@@ -817,12 +815,12 @@
{ cvt.rn.bf16.f32 %rs61, %f301;}
$L__BB0_43:
- add.s32 %r559, %r559, 1;
- setp.lt.s32 %p37, %r559, %r12;
+ add.s32 %r553, %r553, 1;
+ setp.lt.s32 %p37, %r553, %r12;
@%p37 bra $L__BB0_7;
bra.uni $L__BB0_44;
$L__BB0_5:
mov.f32 %f606, 0f00000000;
@@ -841,68 +839,68 @@
mov.f32 %f619, %f606;
mov.f32 %f620, %f606;
mov.f32 %f621, %f606;
$L__BB0_44:
- mov.u32 %r364, %tid.z;
- mad.lo.s32 %r365, %r4, %r364, %r9;
- mad.lo.s32 %r50, %r365, %r3, %r5;
- mul.wide.u32 %rd85, %r50, 4;
+ mov.u32 %r363, %tid.z;
+ mad.lo.s32 %r364, %r4, %r363, %r9;
+ mad.lo.s32 %r49, %r364, %r3, %r5;
+ mul.wide.u32 %rd85, %r49, 4;
add.s64 %rd23, %rd45, %rd85;
st.shared.f32 [%rd23], %f621;
bar.sync 0;
- clz.b32 %r366, %r4;
- mov.u32 %r367, 31;
- sub.s32 %r51, %r367, %r366;
- mov.u32 %r368, 1;
- shl.b32 %r593, %r368, %r51;
- setp.lt.u32 %p38, %r9, %r593;
- add.s32 %r369, %r593, %r9;
- setp.lt.u32 %p39, %r369, %r4;
+ clz.b32 %r365, %r4;
+ mov.u32 %r366, 31;
+ sub.s32 %r50, %r366, %r365;
+ mov.u32 %r367, 1;
+ shl.b32 %r587, %r367, %r50;
+ setp.lt.u32 %p38, %r9, %r587;
+ add.s32 %r368, %r587, %r9;
+ setp.lt.u32 %p39, %r368, %r4;
and.pred %p4, %p38, %p39;
not.pred %p40, %p4;
@%p40 bra $L__BB0_46;
- shl.b32 %r370, %r3, %r51;
- add.s32 %r371, %r50, %r370;
- mul.wide.s32 %rd87, %r371, 4;
+ shl.b32 %r369, %r3, %r50;
+ add.s32 %r370, %r49, %r369;
+ mul.wide.s32 %rd87, %r370, 4;
add.s64 %rd89, %rd45, %rd87;
ld.shared.f32 %f426, [%rd23];
ld.shared.f32 %f427, [%rd89];
add.f32 %f428, %f427, %f426;
st.shared.f32 [%rd23], %f428;
$L__BB0_46:
bar.sync 0;
- setp.lt.s32 %p41, %r593, 4;
+ setp.lt.s32 %p41, %r587, 4;
@%p41 bra $L__BB0_51;
- mov.u32 %r563, %r593;
+ mov.u32 %r557, %r587;
$L__BB0_48:
- shr.u32 %r54, %r563, 1;
- setp.ge.u32 %p42, %r9, %r54;
+ shr.u32 %r53, %r557, 1;
+ setp.ge.u32 %p42, %r9, %r53;
@%p42 bra $L__BB0_50;
- mad.lo.s32 %r372, %r54, %r3, %r50;
- mul.wide.s32 %rd90, %r372, 4;
+ mad.lo.s32 %r371, %r53, %r3, %r49;
+ mul.wide.s32 %rd90, %r371, 4;
add.s64 %rd92, %rd45, %rd90;
ld.shared.f32 %f429, [%rd23];
ld.shared.f32 %f430, [%rd92];
add.f32 %f431, %f430, %f429;
st.shared.f32 [%rd23], %f431;
$L__BB0_50:
bar.sync 0;
- setp.gt.u32 %p43, %r563, 7;
- mov.u32 %r563, %r54;
+ setp.gt.u32 %p43, %r557, 7;
+ mov.u32 %r557, %r53;
@%p43 bra $L__BB0_48;
$L__BB0_51:
- mov.u32 %r564, 0;
- add.s32 %r374, %r50, %r3;
- mul.wide.u32 %rd93, %r374, 4;
+ mov.u32 %r558, 0;
+ add.s32 %r373, %r49, %r3;
+ mul.wide.u32 %rd93, %r373, 4;
add.s64 %rd24, %rd45, %rd93;
@%p12 bra $L__BB0_55;
ld.shared.f32 %f432, [%rd23];
add.f32 %f660, %f432, 0f00000000;
@@ -911,54 +909,54 @@
ld.shared.f32 %f433, [%rd24];
add.f32 %f660, %f660, %f433;
$L__BB0_54:
- mov.b32 %r564, %f660;
+ mov.b32 %r558, %f660;
$L__BB0_55:
bar.sync 0;
st.shared.f32 [%rd23], %f620;
bar.sync 0;
@%p40 bra $L__BB0_57;
- shl.b32 %r375, %r3, %r51;
- add.s32 %r376, %r50, %r375;
- mul.wide.s32 %rd95, %r376, 4;
+ shl.b32 %r374, %r3, %r50;
+ add.s32 %r375, %r49, %r374;
+ mul.wide.s32 %rd95, %r375, 4;
add.s64 %rd97, %rd45, %rd95;
ld.shared.f32 %f434, [%rd23];
ld.shared.f32 %f435, [%rd97];
add.f32 %f436, %f435, %f434;
st.shared.f32 [%rd23], %f436;
$L__BB0_57:
bar.sync 0;
@%p41 bra $L__BB0_62;
- mov.u32 %r565, %r593;
+ mov.u32 %r559, %r587;
$L__BB0_59:
- shr.u32 %r58, %r565, 1;
- setp.ge.u32 %p48, %r9, %r58;
+ shr.u32 %r57, %r559, 1;
+ setp.ge.u32 %p48, %r9, %r57;
@%p48 bra $L__BB0_61;
- mad.lo.s32 %r377, %r58, %r3, %r50;
- mul.wide.s32 %rd98, %r377, 4;
+ mad.lo.s32 %r376, %r57, %r3, %r49;
+ mul.wide.s32 %rd98, %r376, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f437, [%rd23];
ld.shared.f32 %f438, [%rd100];
add.f32 %f439, %f438, %f437;
st.shared.f32 [%rd23], %f439;
$L__BB0_61:
bar.sync 0;
- setp.gt.u32 %p49, %r565, 7;
- mov.u32 %r565, %r58;
+ setp.gt.u32 %p49, %r559, 7;
+ mov.u32 %r559, %r57;
@%p49 bra $L__BB0_59;
$L__BB0_62:
- mov.u32 %r566, 0;
+ mov.u32 %r560, 0;
@%p12 bra $L__BB0_66;
ld.shared.f32 %f440, [%rd23];
add.f32 %f661, %f440, 0f00000000;
setp.lt.u32 %p51, %r4, 2;
@@ -966,54 +964,54 @@
ld.shared.f32 %f441, [%rd24];
add.f32 %f661, %f661, %f441;
$L__BB0_65:
- mov.b32 %r566, %f661;
+ mov.b32 %r560, %f661;
$L__BB0_66:
bar.sync 0;
st.shared.f32 [%rd23], %f619;
bar.sync 0;
@%p40 bra $L__BB0_68;
- shl.b32 %r379, %r3, %r51;
- add.s32 %r380, %r50, %r379;
- mul.wide.s32 %rd101, %r380, 4;
+ shl.b32 %r378, %r3, %r50;
+ add.s32 %r379, %r49, %r378;
+ mul.wide.s32 %rd101, %r379, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f442, [%rd23];
ld.shared.f32 %f443, [%rd103];
add.f32 %f444, %f443, %f442;
st.shared.f32 [%rd23], %f444;
$L__BB0_68:
bar.sync 0;
@%p41 bra $L__BB0_73;
- mov.u32 %r567, %r593;
+ mov.u32 %r561, %r587;
$L__BB0_70:
- shr.u32 %r62, %r567, 1;
- setp.ge.u32 %p54, %r9, %r62;
+ shr.u32 %r61, %r561, 1;
+ setp.ge.u32 %p54, %r9, %r61;
@%p54 bra $L__BB0_72;
- mad.lo.s32 %r381, %r62, %r3, %r50;
- mul.wide.s32 %rd104, %r381, 4;
+ mad.lo.s32 %r380, %r61, %r3, %r49;
+ mul.wide.s32 %rd104, %r380, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f445, [%rd23];
ld.shared.f32 %f446, [%rd106];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd23], %f447;
$L__BB0_72:
bar.sync 0;
- setp.gt.u32 %p55, %r567, 7;
- mov.u32 %r567, %r62;
+ setp.gt.u32 %p55, %r561, 7;
+ mov.u32 %r561, %r61;
@%p55 bra $L__BB0_70;
$L__BB0_73:
- mov.u32 %r568, 0;
+ mov.u32 %r562, 0;
@%p12 bra $L__BB0_77;
ld.shared.f32 %f448, [%rd23];
add.f32 %f662, %f448, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@@ -1021,54 +1019,54 @@
ld.shared.f32 %f449, [%rd24];
add.f32 %f662, %f662, %f449;
$L__BB0_76:
- mov.b32 %r568, %f662;
+ mov.b32 %r562, %f662;
$L__BB0_77:
bar.sync 0;
st.shared.f32 [%rd23], %f618;
bar.sync 0;
@%p40 bra $L__BB0_79;
- shl.b32 %r383, %r3, %r51;
- add.s32 %r384, %r50, %r383;
- mul.wide.s32 %rd107, %r384, 4;
+ shl.b32 %r382, %r3, %r50;
+ add.s32 %r383, %r49, %r382;
+ mul.wide.s32 %rd107, %r383, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f450, [%rd23];
ld.shared.f32 %f451, [%rd109];
add.f32 %f452, %f451, %f450;
st.shared.f32 [%rd23], %f452;
$L__BB0_79:
bar.sync 0;
@%p41 bra $L__BB0_84;
- mov.u32 %r569, %r593;
+ mov.u32 %r563, %r587;
$L__BB0_81:
- shr.u32 %r66, %r569, 1;
- setp.ge.u32 %p60, %r9, %r66;
+ shr.u32 %r65, %r563, 1;
+ setp.ge.u32 %p60, %r9, %r65;
@%p60 bra $L__BB0_83;
- mad.lo.s32 %r385, %r66, %r3, %r50;
- mul.wide.s32 %rd110, %r385, 4;
+ mad.lo.s32 %r384, %r65, %r3, %r49;
+ mul.wide.s32 %rd110, %r384, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f453, [%rd23];
ld.shared.f32 %f454, [%rd112];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd23], %f455;
$L__BB0_83:
bar.sync 0;
- setp.gt.u32 %p61, %r569, 7;
- mov.u32 %r569, %r66;
+ setp.gt.u32 %p61, %r563, 7;
+ mov.u32 %r563, %r65;
@%p61 bra $L__BB0_81;
$L__BB0_84:
- mov.u32 %r570, 0;
+ mov.u32 %r564, 0;
@%p12 bra $L__BB0_88;
ld.shared.f32 %f456, [%rd23];
add.f32 %f663, %f456, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1076,54 +1074,54 @@
ld.shared.f32 %f457, [%rd24];
add.f32 %f663, %f663, %f457;
$L__BB0_87:
- mov.b32 %r570, %f663;
+ mov.b32 %r564, %f663;
$L__BB0_88:
bar.sync 0;
st.shared.f32 [%rd23], %f617;
bar.sync 0;
@%p40 bra $L__BB0_90;
- shl.b32 %r387, %r3, %r51;
- add.s32 %r388, %r50, %r387;
- mul.wide.s32 %rd113, %r388, 4;
+ shl.b32 %r386, %r3, %r50;
+ add.s32 %r387, %r49, %r386;
+ mul.wide.s32 %rd113, %r387, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f458, [%rd23];
ld.shared.f32 %f459, [%rd115];
add.f32 %f460, %f459, %f458;
st.shared.f32 [%rd23], %f460;
$L__BB0_90:
bar.sync 0;
@%p41 bra $L__BB0_95;
- mov.u32 %r571, %r593;
+ mov.u32 %r565, %r587;
$L__BB0_92:
- shr.u32 %r70, %r571, 1;
- setp.ge.u32 %p66, %r9, %r70;
+ shr.u32 %r69, %r565, 1;
+ setp.ge.u32 %p66, %r9, %r69;
@%p66 bra $L__BB0_94;
- mad.lo.s32 %r389, %r70, %r3, %r50;
- mul.wide.s32 %rd116, %r389, 4;
+ mad.lo.s32 %r388, %r69, %r3, %r49;
+ mul.wide.s32 %rd116, %r388, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f461, [%rd23];
ld.shared.f32 %f462, [%rd118];
add.f32 %f463, %f462, %f461;
st.shared.f32 [%rd23], %f463;
$L__BB0_94:
bar.sync 0;
- setp.gt.u32 %p67, %r571, 7;
- mov.u32 %r571, %r70;
+ setp.gt.u32 %p67, %r565, 7;
+ mov.u32 %r565, %r69;
@%p67 bra $L__BB0_92;
$L__BB0_95:
- mov.u32 %r572, 0;
+ mov.u32 %r566, 0;
@%p12 bra $L__BB0_99;
ld.shared.f32 %f464, [%rd23];
add.f32 %f664, %f464, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1131,54 +1129,54 @@
ld.shared.f32 %f465, [%rd24];
add.f32 %f664, %f664, %f465;
$L__BB0_98:
- mov.b32 %r572, %f664;
+ mov.b32 %r566, %f664;
$L__BB0_99:
bar.sync 0;
st.shared.f32 [%rd23], %f616;
bar.sync 0;
@%p40 bra $L__BB0_101;
- shl.b32 %r391, %r3, %r51;
- add.s32 %r392, %r50, %r391;
- mul.wide.s32 %rd119, %r392, 4;
+ shl.b32 %r390, %r3, %r50;
+ add.s32 %r391, %r49, %r390;
+ mul.wide.s32 %rd119, %r391, 4;
add.s64 %rd121, %rd45, %rd119;
ld.shared.f32 %f466, [%rd23];
ld.shared.f32 %f467, [%rd121];
add.f32 %f468, %f467, %f466;
st.shared.f32 [%rd23], %f468;
$L__BB0_101:
bar.sync 0;
@%p41 bra $L__BB0_106;
- mov.u32 %r573, %r593;
+ mov.u32 %r567, %r587;
$L__BB0_103:
- shr.u32 %r74, %r573, 1;
- setp.ge.u32 %p72, %r9, %r74;
+ shr.u32 %r73, %r567, 1;
+ setp.ge.u32 %p72, %r9, %r73;
@%p72 bra $L__BB0_105;
- mad.lo.s32 %r393, %r74, %r3, %r50;
- mul.wide.s32 %rd122, %r393, 4;
+ mad.lo.s32 %r392, %r73, %r3, %r49;
+ mul.wide.s32 %rd122, %r392, 4;
add.s64 %rd124, %rd45, %rd122;
ld.shared.f32 %f469, [%rd23];
ld.shared.f32 %f470, [%rd124];
add.f32 %f471, %f470, %f469;
st.shared.f32 [%rd23], %f471;
$L__BB0_105:
bar.sync 0;
- setp.gt.u32 %p73, %r573, 7;
- mov.u32 %r573, %r74;
+ setp.gt.u32 %p73, %r567, 7;
+ mov.u32 %r567, %r73;
@%p73 bra $L__BB0_103;
$L__BB0_106:
- mov.u32 %r574, 0;
+ mov.u32 %r568, 0;
@%p12 bra $L__BB0_110;
ld.shared.f32 %f472, [%rd23];
add.f32 %f665, %f472, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1186,54 +1184,54 @@
ld.shared.f32 %f473, [%rd24];
add.f32 %f665, %f665, %f473;
$L__BB0_109:
- mov.b32 %r574, %f665;
+ mov.b32 %r568, %f665;
$L__BB0_110:
bar.sync 0;
st.shared.f32 [%rd23], %f615;
bar.sync 0;
@%p40 bra $L__BB0_112;
- shl.b32 %r395, %r3, %r51;
- add.s32 %r396, %r50, %r395;
- mul.wide.s32 %rd125, %r396, 4;
+ shl.b32 %r394, %r3, %r50;
+ add.s32 %r395, %r49, %r394;
+ mul.wide.s32 %rd125, %r395, 4;
add.s64 %rd127, %rd45, %rd125;
ld.shared.f32 %f474, [%rd23];
ld.shared.f32 %f475, [%rd127];
add.f32 %f476, %f475, %f474;
st.shared.f32 [%rd23], %f476;
$L__BB0_112:
bar.sync 0;
@%p41 bra $L__BB0_117;
- mov.u32 %r575, %r593;
+ mov.u32 %r569, %r587;
$L__BB0_114:
- shr.u32 %r78, %r575, 1;
- setp.ge.u32 %p78, %r9, %r78;
+ shr.u32 %r77, %r569, 1;
+ setp.ge.u32 %p78, %r9, %r77;
@%p78 bra $L__BB0_116;
- mad.lo.s32 %r397, %r78, %r3, %r50;
- mul.wide.s32 %rd128, %r397, 4;
+ mad.lo.s32 %r396, %r77, %r3, %r49;
+ mul.wide.s32 %rd128, %r396, 4;
add.s64 %rd130, %rd45, %rd128;
ld.shared.f32 %f477, [%rd23];
ld.shared.f32 %f478, [%rd130];
add.f32 %f479, %f478, %f477;
st.shared.f32 [%rd23], %f479;
$L__BB0_116:
bar.sync 0;
- setp.gt.u32 %p79, %r575, 7;
- mov.u32 %r575, %r78;
+ setp.gt.u32 %p79, %r569, 7;
+ mov.u32 %r569, %r77;
@%p79 bra $L__BB0_114;
$L__BB0_117:
- mov.u32 %r576, 0;
+ mov.u32 %r570, 0;
@%p12 bra $L__BB0_121;
ld.shared.f32 %f480, [%rd23];
add.f32 %f666, %f480, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1241,54 +1239,54 @@
ld.shared.f32 %f481, [%rd24];
add.f32 %f666, %f666, %f481;
$L__BB0_120:
- mov.b32 %r576, %f666;
+ mov.b32 %r570, %f666;
$L__BB0_121:
bar.sync 0;
st.shared.f32 [%rd23], %f614;
bar.sync 0;
@%p40 bra $L__BB0_123;
- shl.b32 %r399, %r3, %r51;
- add.s32 %r400, %r50, %r399;
- mul.wide.s32 %rd131, %r400, 4;
+ shl.b32 %r398, %r3, %r50;
+ add.s32 %r399, %r49, %r398;
+ mul.wide.s32 %rd131, %r399, 4;
add.s64 %rd133, %rd45, %rd131;
ld.shared.f32 %f482, [%rd23];
ld.shared.f32 %f483, [%rd133];
add.f32 %f484, %f483, %f482;
st.shared.f32 [%rd23], %f484;
$L__BB0_123:
bar.sync 0;
@%p41 bra $L__BB0_128;
- mov.u32 %r577, %r593;
+ mov.u32 %r571, %r587;
$L__BB0_125:
- shr.u32 %r82, %r577, 1;
- setp.ge.u32 %p84, %r9, %r82;
+ shr.u32 %r81, %r571, 1;
+ setp.ge.u32 %p84, %r9, %r81;
@%p84 bra $L__BB0_127;
- mad.lo.s32 %r401, %r82, %r3, %r50;
- mul.wide.s32 %rd134, %r401, 4;
+ mad.lo.s32 %r400, %r81, %r3, %r49;
+ mul.wide.s32 %rd134, %r400, 4;
add.s64 %rd136, %rd45, %rd134;
ld.shared.f32 %f485, [%rd23];
ld.shared.f32 %f486, [%rd136];
add.f32 %f487, %f486, %f485;
st.shared.f32 [%rd23], %f487;
$L__BB0_127:
bar.sync 0;
- setp.gt.u32 %p85, %r577, 7;
- mov.u32 %r577, %r82;
+ setp.gt.u32 %p85, %r571, 7;
+ mov.u32 %r571, %r81;
@%p85 bra $L__BB0_125;
$L__BB0_128:
- mov.u32 %r578, 0;
+ mov.u32 %r572, 0;
@%p12 bra $L__BB0_132;
ld.shared.f32 %f488, [%rd23];
add.f32 %f667, %f488, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1296,55 +1294,55 @@
ld.shared.f32 %f489, [%rd24];
add.f32 %f667, %f667, %f489;
$L__BB0_131:
- mov.b32 %r578, %f667;
+ mov.b32 %r572, %f667;
$L__BB0_132:
bar.sync 0;
- shl.b32 %r85, %r562, 4;
+ shl.b32 %r84, %r556, 4;
st.shared.f32 [%rd23], %f613;
bar.sync 0;
@%p40 bra $L__BB0_134;
- shl.b32 %r403, %r3, %r51;
- add.s32 %r404, %r50, %r403;
- mul.wide.s32 %rd137, %r404, 4;
+ shl.b32 %r402, %r3, %r50;
+ add.s32 %r403, %r49, %r402;
+ mul.wide.s32 %rd137, %r403, 4;
add.s64 %rd139, %rd45, %rd137;
ld.shared.f32 %f490, [%rd23];
ld.shared.f32 %f491, [%rd139];
add.f32 %f492, %f491, %f490;
st.shared.f32 [%rd23], %f492;
$L__BB0_134:
bar.sync 0;
@%p41 bra $L__BB0_139;
- mov.u32 %r579, %r593;
+ mov.u32 %r573, %r587;
$L__BB0_136:
- shr.u32 %r87, %r579, 1;
- setp.ge.u32 %p90, %r9, %r87;
+ shr.u32 %r86, %r573, 1;
+ setp.ge.u32 %p90, %r9, %r86;
@%p90 bra $L__BB0_138;
- mad.lo.s32 %r405, %r87, %r3, %r50;
- mul.wide.s32 %rd140, %r405, 4;
+ mad.lo.s32 %r404, %r86, %r3, %r49;
+ mul.wide.s32 %rd140, %r404, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f493, [%rd23];
ld.shared.f32 %f494, [%rd142];
add.f32 %f495, %f494, %f493;
st.shared.f32 [%rd23], %f495;
$L__BB0_138:
bar.sync 0;
- setp.gt.u32 %p91, %r579, 7;
- mov.u32 %r579, %r87;
+ setp.gt.u32 %p91, %r573, 7;
+ mov.u32 %r573, %r86;
@%p91 bra $L__BB0_136;
$L__BB0_139:
- mov.u32 %r580, 0;
+ mov.u32 %r574, 0;
@%p12 bra $L__BB0_143;
ld.shared.f32 %f496, [%rd23];
add.f32 %f668, %f496, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1352,54 +1350,54 @@
ld.shared.f32 %f497, [%rd24];
add.f32 %f668, %f668, %f497;
$L__BB0_142:
- mov.b32 %r580, %f668;
+ mov.b32 %r574, %f668;
$L__BB0_143:
bar.sync 0;
st.shared.f32 [%rd23], %f612;
bar.sync 0;
@%p40 bra $L__BB0_145;
- shl.b32 %r407, %r3, %r51;
- add.s32 %r408, %r50, %r407;
- mul.wide.s32 %rd143, %r408, 4;
+ shl.b32 %r406, %r3, %r50;
+ add.s32 %r407, %r49, %r406;
+ mul.wide.s32 %rd143, %r407, 4;
add.s64 %rd145, %rd45, %rd143;
ld.shared.f32 %f498, [%rd23];
ld.shared.f32 %f499, [%rd145];
add.f32 %f500, %f499, %f498;
st.shared.f32 [%rd23], %f500;
$L__BB0_145:
bar.sync 0;
@%p41 bra $L__BB0_150;
- mov.u32 %r581, %r593;
+ mov.u32 %r575, %r587;
$L__BB0_147:
- shr.u32 %r91, %r581, 1;
- setp.ge.u32 %p96, %r9, %r91;
+ shr.u32 %r90, %r575, 1;
+ setp.ge.u32 %p96, %r9, %r90;
@%p96 bra $L__BB0_149;
- mad.lo.s32 %r409, %r91, %r3, %r50;
- mul.wide.s32 %rd146, %r409, 4;
+ mad.lo.s32 %r408, %r90, %r3, %r49;
+ mul.wide.s32 %rd146, %r408, 4;
add.s64 %rd148, %rd45, %rd146;
ld.shared.f32 %f501, [%rd23];
ld.shared.f32 %f502, [%rd148];
add.f32 %f503, %f502, %f501;
st.shared.f32 [%rd23], %f503;
$L__BB0_149:
bar.sync 0;
- setp.gt.u32 %p97, %r581, 7;
- mov.u32 %r581, %r91;
+ setp.gt.u32 %p97, %r575, 7;
+ mov.u32 %r575, %r90;
@%p97 bra $L__BB0_147;
$L__BB0_150:
- mov.u32 %r582, 0;
+ mov.u32 %r576, 0;
@%p12 bra $L__BB0_154;
ld.shared.f32 %f504, [%rd23];
add.f32 %f669, %f504, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1407,54 +1405,54 @@
ld.shared.f32 %f505, [%rd24];
add.f32 %f669, %f669, %f505;
$L__BB0_153:
- mov.b32 %r582, %f669;
+ mov.b32 %r576, %f669;
$L__BB0_154:
bar.sync 0;
st.shared.f32 [%rd23], %f611;
bar.sync 0;
@%p40 bra $L__BB0_156;
- shl.b32 %r411, %r3, %r51;
- add.s32 %r412, %r50, %r411;
- mul.wide.s32 %rd149, %r412, 4;
+ shl.b32 %r410, %r3, %r50;
+ add.s32 %r411, %r49, %r410;
+ mul.wide.s32 %rd149, %r411, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f506, [%rd23];
ld.shared.f32 %f507, [%rd151];
add.f32 %f508, %f507, %f506;
st.shared.f32 [%rd23], %f508;
$L__BB0_156:
bar.sync 0;
@%p41 bra $L__BB0_161;
- mov.u32 %r583, %r593;
+ mov.u32 %r577, %r587;
$L__BB0_158:
- shr.u32 %r95, %r583, 1;
- setp.ge.u32 %p102, %r9, %r95;
+ shr.u32 %r94, %r577, 1;
+ setp.ge.u32 %p102, %r9, %r94;
@%p102 bra $L__BB0_160;
- mad.lo.s32 %r413, %r95, %r3, %r50;
- mul.wide.s32 %rd152, %r413, 4;
+ mad.lo.s32 %r412, %r94, %r3, %r49;
+ mul.wide.s32 %rd152, %r412, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f509, [%rd23];
ld.shared.f32 %f510, [%rd154];
add.f32 %f511, %f510, %f509;
st.shared.f32 [%rd23], %f511;
$L__BB0_160:
bar.sync 0;
- setp.gt.u32 %p103, %r583, 7;
- mov.u32 %r583, %r95;
+ setp.gt.u32 %p103, %r577, 7;
+ mov.u32 %r577, %r94;
@%p103 bra $L__BB0_158;
$L__BB0_161:
- mov.u32 %r584, 0;
+ mov.u32 %r578, 0;
@%p12 bra $L__BB0_165;
ld.shared.f32 %f512, [%rd23];
add.f32 %f670, %f512, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1462,54 +1460,54 @@
ld.shared.f32 %f513, [%rd24];
add.f32 %f670, %f670, %f513;
$L__BB0_164:
- mov.b32 %r584, %f670;
+ mov.b32 %r578, %f670;
$L__BB0_165:
bar.sync 0;
st.shared.f32 [%rd23], %f610;
bar.sync 0;
@%p40 bra $L__BB0_167;
- shl.b32 %r415, %r3, %r51;
- add.s32 %r416, %r50, %r415;
- mul.wide.s32 %rd155, %r416, 4;
+ shl.b32 %r414, %r3, %r50;
+ add.s32 %r415, %r49, %r414;
+ mul.wide.s32 %rd155, %r415, 4;
add.s64 %rd157, %rd45, %rd155;
ld.shared.f32 %f514, [%rd23];
ld.shared.f32 %f515, [%rd157];
add.f32 %f516, %f515, %f514;
st.shared.f32 [%rd23], %f516;
$L__BB0_167:
bar.sync 0;
@%p41 bra $L__BB0_172;
- mov.u32 %r585, %r593;
+ mov.u32 %r579, %r587;
$L__BB0_169:
- shr.u32 %r99, %r585, 1;
- setp.ge.u32 %p108, %r9, %r99;
+ shr.u32 %r98, %r579, 1;
+ setp.ge.u32 %p108, %r9, %r98;
@%p108 bra $L__BB0_171;
- mad.lo.s32 %r417, %r99, %r3, %r50;
- mul.wide.s32 %rd158, %r417, 4;
+ mad.lo.s32 %r416, %r98, %r3, %r49;
+ mul.wide.s32 %rd158, %r416, 4;
add.s64 %rd160, %rd45, %rd158;
ld.shared.f32 %f517, [%rd23];
ld.shared.f32 %f518, [%rd160];
add.f32 %f519, %f518, %f517;
st.shared.f32 [%rd23], %f519;
$L__BB0_171:
bar.sync 0;
- setp.gt.u32 %p109, %r585, 7;
- mov.u32 %r585, %r99;
+ setp.gt.u32 %p109, %r579, 7;
+ mov.u32 %r579, %r98;
@%p109 bra $L__BB0_169;
$L__BB0_172:
- mov.u32 %r586, 0;
+ mov.u32 %r580, 0;
@%p12 bra $L__BB0_176;
ld.shared.f32 %f520, [%rd23];
add.f32 %f671, %f520, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1517,54 +1515,54 @@
ld.shared.f32 %f521, [%rd24];
add.f32 %f671, %f671, %f521;
$L__BB0_175:
- mov.b32 %r586, %f671;
+ mov.b32 %r580, %f671;
$L__BB0_176:
bar.sync 0;
st.shared.f32 [%rd23], %f609;
bar.sync 0;
@%p40 bra $L__BB0_178;
- shl.b32 %r419, %r3, %r51;
- add.s32 %r420, %r50, %r419;
- mul.wide.s32 %rd161, %r420, 4;
+ shl.b32 %r418, %r3, %r50;
+ add.s32 %r419, %r49, %r418;
+ mul.wide.s32 %rd161, %r419, 4;
add.s64 %rd163, %rd45, %rd161;
ld.shared.f32 %f522, [%rd23];
ld.shared.f32 %f523, [%rd163];
add.f32 %f524, %f523, %f522;
st.shared.f32 [%rd23], %f524;
$L__BB0_178:
bar.sync 0;
@%p41 bra $L__BB0_183;
- mov.u32 %r587, %r593;
+ mov.u32 %r581, %r587;
$L__BB0_180:
- shr.u32 %r103, %r587, 1;
- setp.ge.u32 %p114, %r9, %r103;
+ shr.u32 %r102, %r581, 1;
+ setp.ge.u32 %p114, %r9, %r102;
@%p114 bra $L__BB0_182;
- mad.lo.s32 %r421, %r103, %r3, %r50;
- mul.wide.s32 %rd164, %r421, 4;
+ mad.lo.s32 %r420, %r102, %r3, %r49;
+ mul.wide.s32 %rd164, %r420, 4;
add.s64 %rd166, %rd45, %rd164;
ld.shared.f32 %f525, [%rd23];
ld.shared.f32 %f526, [%rd166];
add.f32 %f527, %f526, %f525;
st.shared.f32 [%rd23], %f527;
$L__BB0_182:
bar.sync 0;
- setp.gt.u32 %p115, %r587, 7;
- mov.u32 %r587, %r103;
+ setp.gt.u32 %p115, %r581, 7;
+ mov.u32 %r581, %r102;
@%p115 bra $L__BB0_180;
$L__BB0_183:
- mov.u32 %r588, 0;
+ mov.u32 %r582, 0;
@%p12 bra $L__BB0_187;
ld.shared.f32 %f528, [%rd23];
add.f32 %f672, %f528, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1572,54 +1570,54 @@
ld.shared.f32 %f529, [%rd24];
add.f32 %f672, %f672, %f529;
$L__BB0_186:
- mov.b32 %r588, %f672;
+ mov.b32 %r582, %f672;
$L__BB0_187:
bar.sync 0;
st.shared.f32 [%rd23], %f608;
bar.sync 0;
@%p40 bra $L__BB0_189;
- shl.b32 %r423, %r3, %r51;
- add.s32 %r424, %r50, %r423;
- mul.wide.s32 %rd167, %r424, 4;
+ shl.b32 %r422, %r3, %r50;
+ add.s32 %r423, %r49, %r422;
+ mul.wide.s32 %rd167, %r423, 4;
add.s64 %rd169, %rd45, %rd167;
ld.shared.f32 %f530, [%rd23];
ld.shared.f32 %f531, [%rd169];
add.f32 %f532, %f531, %f530;
st.shared.f32 [%rd23], %f532;
$L__BB0_189:
bar.sync 0;
@%p41 bra $L__BB0_194;
- mov.u32 %r589, %r593;
+ mov.u32 %r583, %r587;
$L__BB0_191:
- shr.u32 %r107, %r589, 1;
- setp.ge.u32 %p120, %r9, %r107;
+ shr.u32 %r106, %r583, 1;
+ setp.ge.u32 %p120, %r9, %r106;
@%p120 bra $L__BB0_193;
- mad.lo.s32 %r425, %r107, %r3, %r50;
- mul.wide.s32 %rd170, %r425, 4;
+ mad.lo.s32 %r424, %r106, %r3, %r49;
+ mul.wide.s32 %rd170, %r424, 4;
add.s64 %rd172, %rd45, %rd170;
ld.shared.f32 %f533, [%rd23];
ld.shared.f32 %f534, [%rd172];
add.f32 %f535, %f534, %f533;
st.shared.f32 [%rd23], %f535;
$L__BB0_193:
bar.sync 0;
- setp.gt.u32 %p121, %r589, 7;
- mov.u32 %r589, %r107;
+ setp.gt.u32 %p121, %r583, 7;
+ mov.u32 %r583, %r106;
@%p121 bra $L__BB0_191;
$L__BB0_194:
- mov.u32 %r590, 0;
+ mov.u32 %r584, 0;
@%p12 bra $L__BB0_198;
ld.shared.f32 %f536, [%rd23];
add.f32 %f673, %f536, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1627,54 +1625,54 @@
ld.shared.f32 %f537, [%rd24];
add.f32 %f673, %f673, %f537;
$L__BB0_197:
- mov.b32 %r590, %f673;
+ mov.b32 %r584, %f673;
$L__BB0_198:
bar.sync 0;
st.shared.f32 [%rd23], %f607;
bar.sync 0;
@%p40 bra $L__BB0_200;
- shl.b32 %r427, %r3, %r51;
- add.s32 %r428, %r50, %r427;
- mul.wide.s32 %rd173, %r428, 4;
+ shl.b32 %r426, %r3, %r50;
+ add.s32 %r427, %r49, %r426;
+ mul.wide.s32 %rd173, %r427, 4;
add.s64 %rd175, %rd45, %rd173;
ld.shared.f32 %f538, [%rd23];
ld.shared.f32 %f539, [%rd175];
add.f32 %f540, %f539, %f538;
st.shared.f32 [%rd23], %f540;
$L__BB0_200:
bar.sync 0;
@%p41 bra $L__BB0_205;
- mov.u32 %r591, %r593;
+ mov.u32 %r585, %r587;
$L__BB0_202:
- shr.u32 %r111, %r591, 1;
- setp.ge.u32 %p126, %r9, %r111;
+ shr.u32 %r110, %r585, 1;
+ setp.ge.u32 %p126, %r9, %r110;
@%p126 bra $L__BB0_204;
- mad.lo.s32 %r429, %r111, %r3, %r50;
- mul.wide.s32 %rd176, %r429, 4;
+ mad.lo.s32 %r428, %r110, %r3, %r49;
+ mul.wide.s32 %rd176, %r428, 4;
add.s64 %rd178, %rd45, %rd176;
ld.shared.f32 %f541, [%rd23];
ld.shared.f32 %f542, [%rd178];
add.f32 %f543, %f542, %f541;
st.shared.f32 [%rd23], %f543;
$L__BB0_204:
bar.sync 0;
- setp.gt.u32 %p127, %r591, 7;
- mov.u32 %r591, %r111;
+ setp.gt.u32 %p127, %r585, 7;
+ mov.u32 %r585, %r110;
@%p127 bra $L__BB0_202;
$L__BB0_205:
- mov.u32 %r592, 0;
+ mov.u32 %r586, 0;
@%p12 bra $L__BB0_209;
ld.shared.f32 %f544, [%rd23];
add.f32 %f674, %f544, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1682,21 +1680,21 @@
ld.shared.f32 %f545, [%rd24];
add.f32 %f674, %f674, %f545;
$L__BB0_208:
- mov.b32 %r592, %f674;
+ mov.b32 %r586, %f674;
$L__BB0_209:
bar.sync 0;
st.shared.f32 [%rd23], %f606;
bar.sync 0;
@%p40 bra $L__BB0_211;
- shl.b32 %r431, %r3, %r51;
- add.s32 %r432, %r50, %r431;
- mul.wide.s32 %rd179, %r432, 4;
+ shl.b32 %r430, %r3, %r50;
+ add.s32 %r431, %r49, %r430;
+ mul.wide.s32 %rd179, %r431, 4;
add.s64 %rd181, %rd45, %rd179;
ld.shared.f32 %f546, [%rd23];
ld.shared.f32 %f547, [%rd181];
add.f32 %f548, %f547, %f546;
st.shared.f32 [%rd23], %f548;
@@ -1704,30 +1702,30 @@
$L__BB0_211:
bar.sync 0;
@%p41 bra $L__BB0_215;
$L__BB0_212:
- shr.u32 %r115, %r593, 1;
- setp.ge.u32 %p132, %r9, %r115;
+ shr.u32 %r114, %r587, 1;
+ setp.ge.u32 %p132, %r9, %r114;
@%p132 bra $L__BB0_214;
- mad.lo.s32 %r433, %r115, %r3, %r50;
- mul.wide.s32 %rd182, %r433, 4;
+ mad.lo.s32 %r432, %r114, %r3, %r49;
+ mul.wide.s32 %rd182, %r432, 4;
add.s64 %rd184, %rd45, %rd182;
ld.shared.f32 %f549, [%rd23];
ld.shared.f32 %f550, [%rd184];
add.f32 %f551, %f550, %f549;
st.shared.f32 [%rd23], %f551;
$L__BB0_214:
bar.sync 0;
- setp.gt.u32 %p133, %r593, 7;
- mov.u32 %r593, %r115;
+ setp.gt.u32 %p133, %r587, 7;
+ mov.u32 %r587, %r114;
@%p133 bra $L__BB0_212;
$L__BB0_215:
- mov.u32 %r594, 0;
+ mov.u32 %r588, 0;
@%p12 bra $L__BB0_219;
ld.shared.f32 %f552, [%rd23];
add.f32 %f675, %f552, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1735,275 +1733,271 @@
ld.shared.f32 %f553, [%rd24];
add.f32 %f675, %f675, %f553;
$L__BB0_218:
- mov.b32 %r594, %f675;
+ mov.b32 %r588, %f675;
$L__BB0_219:
bar.sync 0;
@%p1 bra $L__BB0_224;
bra.uni $L__BB0_220;
$L__BB0_224:
@%p12 bra $L__BB0_226;
- shl.b32 %r552, %r5, 3;
- mov.u32 %r459, %ctaid.y;
- mad.lo.s32 %r460, %r213, %r459, %r552;
- add.s32 %r461, %r460, %r85;
- mul.wide.s32 %rd191, %r461, 4;
+ mov.u32 %r458, %ctaid.y;
+ mad.lo.s32 %r459, %r212, %r458, %r8;
+ add.s32 %r460, %r459, %r84;
+ mul.wide.s32 %rd191, %r460, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r564,%r566,%r568,%r570};
-
- add.s32 %r462, %r461, 4;
- mul.wide.s32 %rd192, %r462, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r558,%r560,%r562,%r564};
+
+ add.s32 %r461, %r460, 4;
+ mul.wide.s32 %rd192, %r461, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r572,%r574,%r576,%r578};
+ st.volatile.global.v4.s32 [%rd190], {%r566,%r568,%r570,%r572};
bra.uni $L__BB0_226;
$L__BB0_220:
- shl.b32 %r549, %r5, 3;
setp.eq.s32 %p136, %r9, 0;
and.pred %p5, %p136, %p10;
not.pred %p138, %p5;
- add.s32 %r435, %r549, 3;
- sub.s32 %r118, %r435, %r213;
- mov.u32 %r436, %ctaid.y;
- mad.lo.s32 %r119, %r213, %r436, %r549;
- neg.s32 %r437, %r85;
- setp.ge.s32 %p139, %r118, %r437;
+ add.s32 %r434, %r8, 3;
+ sub.s32 %r117, %r434, %r212;
+ mov.u32 %r435, %ctaid.y;
+ mad.lo.s32 %r118, %r212, %r435, %r8;
+ neg.s32 %r436, %r84;
+ setp.ge.s32 %p139, %r117, %r436;
or.pred %p140, %p138, %p139;
@%p140 bra $L__BB0_222;
- add.s32 %r442, %r119, %r85;
- mul.wide.s32 %rd186, %r442, 4;
+ add.s32 %r441, %r118, %r84;
+ mul.wide.s32 %rd186, %r441, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r564,%r566,%r568,%r570};
+ st.volatile.global.v4.s32 [%rd185], {%r558,%r560,%r562,%r564};
$L__BB0_222:
- mov.u32 %r443, -4;
- sub.s32 %r444, %r443, %r85;
- setp.ge.s32 %p141, %r118, %r444;
+ mov.u32 %r442, -4;
+ sub.s32 %r443, %r442, %r84;
+ setp.ge.s32 %p141, %r117, %r443;
or.pred %p143, %p138, %p141;
@%p143 bra $L__BB0_226;
- add.s32 %r449, %r119, %r85;
- add.s32 %r450, %r449, 4;
- mul.wide.s32 %rd188, %r450, 4;
+ add.s32 %r448, %r118, %r84;
+ add.s32 %r449, %r448, 4;
+ mul.wide.s32 %rd188, %r449, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r572,%r574,%r576,%r578};
+ st.volatile.global.v4.s32 [%rd187], {%r566,%r568,%r570,%r572};
$L__BB0_226:
@%p1 bra $L__BB0_233;
bra.uni $L__BB0_227;
$L__BB0_233:
@%p12 bra $L__BB0_235;
- shl.b32 %r551, %r5, 3;
- shl.b32 %r487, %r562, 5;
- mov.u32 %r488, %ctaid.y;
- mad.lo.s32 %r489, %r213, %r488, %r551;
- add.s32 %r490, %r489, %r487;
- mul.wide.s32 %rd199, %r490, 4;
+ shl.b32 %r486, %r556, 5;
+ mov.u32 %r487, %ctaid.y;
+ mad.lo.s32 %r488, %r212, %r487, %r8;
+ add.s32 %r489, %r488, %r486;
+ mul.wide.s32 %rd199, %r489, 4;
add.s64 %rd197, %rd42, %rd199;
- st.volatile.global.v4.s32 [%rd197], {%r580,%r582,%r584,%r586};
-
- add.s32 %r491, %r490, 4;
- mul.wide.s32 %rd200, %r491, 4;
+ st.volatile.global.v4.s32 [%rd197], {%r574,%r576,%r578,%r580};
+
+ add.s32 %r490, %r489, 4;
+ mul.wide.s32 %rd200, %r490, 4;
add.s64 %rd198, %rd42, %rd200;
- st.volatile.global.v4.s32 [%rd198], {%r588,%r590,%r592,%r594};
+ st.volatile.global.v4.s32 [%rd198], {%r582,%r584,%r586,%r588};
bra.uni $L__BB0_235;
$L__BB0_227:
- shl.b32 %r550, %r5, 3;
setp.eq.s32 %p145, %r9, 0;
and.pred %p6, %p145, %p10;
- add.s32 %r463, %r550, 3;
- sub.s32 %r120, %r463, %r213;
- mov.u32 %r464, %ctaid.y;
- mad.lo.s32 %r121, %r213, %r464, %r550;
+ add.s32 %r462, %r8, 3;
+ sub.s32 %r119, %r462, %r212;
+ mov.u32 %r463, %ctaid.y;
+ mad.lo.s32 %r120, %r212, %r463, %r8;
not.pred %p147, %p6;
@%p147 bra $L__BB0_230;
- shl.b32 %r122, %r562, 5;
- neg.s32 %r465, %r122;
- setp.ge.s32 %p148, %r120, %r465;
+ shl.b32 %r121, %r556, 5;
+ neg.s32 %r464, %r121;
+ setp.ge.s32 %p148, %r119, %r464;
@%p148 bra $L__BB0_230;
- add.s32 %r470, %r121, %r122;
- mul.wide.s32 %rd194, %r470, 4;
+ add.s32 %r469, %r120, %r121;
+ mul.wide.s32 %rd194, %r469, 4;
add.s64 %rd193, %rd42, %rd194;
- st.volatile.global.v4.s32 [%rd193], {%r580,%r582,%r584,%r586};
+ st.volatile.global.v4.s32 [%rd193], {%r574,%r576,%r578,%r580};
$L__BB0_230:
@%p147 bra $L__BB0_235;
- shl.b32 %r123, %r562, 5;
- mov.u32 %r471, -4;
- sub.s32 %r472, %r471, %r123;
- setp.ge.s32 %p150, %r120, %r472;
+ shl.b32 %r122, %r556, 5;
+ mov.u32 %r470, -4;
+ sub.s32 %r471, %r470, %r122;
+ setp.ge.s32 %p150, %r119, %r471;
@%p150 bra $L__BB0_235;
- add.s32 %r477, %r121, %r123;
- add.s32 %r478, %r477, 4;
- mul.wide.s32 %rd196, %r478, 4;
+ add.s32 %r476, %r120, %r122;
+ add.s32 %r477, %r476, 4;
+ mul.wide.s32 %rd196, %r477, 4;
add.s64 %rd195, %rd42, %rd196;
- st.volatile.global.v4.s32 [%rd195], {%r588,%r590,%r592,%r594};
+ st.volatile.global.v4.s32 [%rd195], {%r582,%r584,%r586,%r588};
$L__BB0_235:
- mov.u32 %r124, %ctaid.y;
+ mov.u32 %r123, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r492, %r5, %r9;
- or.b32 %r494, %r492, %r364;
- setp.ne.s32 %p152, %r494, 0;
+ or.b32 %r491, %r5, %r9;
+ or.b32 %r493, %r491, %r363;
+ setp.ne.s32 %p152, %r493, 0;
@%p152 bra $L__BB0_239;
ld.param.u64 %rd236, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd201, %rd236;
- mov.u32 %r495, %ctaid.x;
- mov.u32 %r496, %ctaid.z;
- mov.u32 %r497, %nctaid.x;
- mad.lo.s32 %r498, %r496, %r497, %r495;
- mul.wide.s32 %rd202, %r498, 8;
+ mov.u32 %r494, %ctaid.x;
+ mov.u32 %r495, %ctaid.z;
+ mov.u32 %r496, %nctaid.x;
+ mad.lo.s32 %r497, %r495, %r496, %r494;
+ mul.wide.s32 %rd202, %r497, 8;
add.s64 %rd27, %rd201, %rd202;
- add.s32 %r499, %r11, -1;
- setp.eq.s32 %p153, %r124, %r499;
+ add.s32 %r498, %r11, -1;
+ setp.eq.s32 %p153, %r123, %r498;
cvt.s64.s32 %rd203, %r11;
mov.u64 %rd204, -9223372036854775807;
sub.s64 %rd205, %rd204, %rd203;
selp.b64 %rd206, %rd205, 1, %p153;
atom.global.add.u64 %rd28, [%rd27], %rd206;
ld.volatile.global.u64 %rd207, [%rd27];
xor.b64 %rd208, %rd207, %rd28;
setp.lt.s64 %p154, %rd208, 0;
@%p154 bra $L__BB0_239;
- mov.u32 %r595, 8;
+ mov.u32 %r589, 8;
$L__BB0_238:
- nanosleep.u32 %r595;
-
- setp.lt.u32 %p155, %r595, 256;
- selp.u32 %r502, 1, 0, %p155;
- shl.b32 %r595, %r595, %r502;
+ nanosleep.u32 %r589;
+
+ setp.lt.u32 %p155, %r589, 256;
+ selp.u32 %r501, 1, 0, %p155;
+ shl.b32 %r589, %r589, %r501;
ld.volatile.global.u64 %rd209, [%rd27];
xor.b64 %rd210, %rd209, %rd28;
setp.gt.s64 %p156, %rd210, -1;
@%p156 bra $L__BB0_238;
$L__BB0_239:
ld.param.u64 %rd235, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd234, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- mov.u32 %r504, 1;
- add.s32 %r505, %r213, 1;
- shr.u32 %r506, %r505, 31;
- add.s32 %r507, %r505, %r506;
- shr.s32 %r508, %r507, 1;
- add.s32 %r509, %r4, %r508;
- add.s32 %r510, %r509, -1;
- div.s32 %r511, %r510, %r4;
- add.s32 %r512, %r11, -1;
- add.s32 %r513, %r512, %r511;
- div.s32 %r127, %r513, %r11;
- add.s32 %r128, %r512, %r3;
- shl.b32 %r129, %r9, 1;
- shl.b32 %r514, %r4, 1;
- mad.lo.s32 %r132, %r514, %r124, %r129;
- or.b32 %r130, %r132, 1;
- mul.lo.s32 %r131, %r514, %r11;
- clz.b32 %r515, %r3;
- mov.u32 %r516, 31;
- sub.s32 %r517, %r516, %r515;
- shl.b32 %r133, %r504, %r517;
- setp.lt.u32 %p157, %r5, %r133;
- add.s32 %r518, %r133, %r5;
- setp.lt.u32 %p158, %r518, %r3;
+ mov.u32 %r503, 1;
+ add.s32 %r504, %r212, 1;
+ shr.u32 %r505, %r504, 31;
+ add.s32 %r506, %r504, %r505;
+ shr.s32 %r507, %r506, 1;
+ add.s32 %r508, %r4, %r507;
+ add.s32 %r509, %r508, -1;
+ div.s32 %r510, %r509, %r4;
+ add.s32 %r511, %r11, -1;
+ add.s32 %r512, %r511, %r510;
+ div.s32 %r126, %r512, %r11;
+ add.s32 %r127, %r511, %r3;
+ shl.b32 %r128, %r9, 1;
+ shl.b32 %r513, %r4, 1;
+ mad.lo.s32 %r131, %r513, %r123, %r128;
+ or.b32 %r129, %r131, 1;
+ mul.lo.s32 %r130, %r513, %r11;
+ clz.b32 %r514, %r3;
+ mov.u32 %r515, 31;
+ sub.s32 %r516, %r515, %r514;
+ shl.b32 %r132, %r503, %r516;
+ setp.lt.u32 %p157, %r5, %r132;
+ add.s32 %r517, %r132, %r5;
+ setp.lt.u32 %p158, %r517, %r3;
and.pred %p7, %p157, %p158;
- add.s32 %r519, %r50, %r133;
- mul.wide.s32 %rd211, %r519, 4;
+ add.s32 %r518, %r49, %r132;
+ mul.wide.s32 %rd211, %r518, 4;
add.s64 %rd29, %rd45, %rd211;
- shr.u32 %r520, %r133, 31;
- add.s32 %r521, %r133, %r520;
- shr.s32 %r134, %r521, 1;
- add.s32 %r522, %r50, 1;
- mul.wide.u32 %rd213, %r522, 4;
+ shr.u32 %r519, %r132, 31;
+ add.s32 %r520, %r132, %r519;
+ shr.s32 %r133, %r520, 1;
+ add.s32 %r521, %r49, 1;
+ mul.wide.u32 %rd213, %r521, 4;
add.s64 %rd30, %rd45, %rd213;
cvta.to.global.u64 %rd31, %rd234;
cvta.to.global.u64 %rd32, %rd235;
- mov.u32 %r596, 0;
+ mov.u32 %r590, 0;
not.pred %p184, %p7;
bra.uni $L__BB0_240;
$L__BB0_303:
- add.s32 %r596, %r596, 1;
+ add.s32 %r590, %r590, 1;
$L__BB0_240:
.pragma "nounroll";
- setp.lt.s32 %p159, %r596, %r127;
+ setp.lt.s32 %p159, %r590, %r126;
@%p159 bra $L__BB0_274;
bra.uni $L__BB0_241;
$L__BB0_274:
- div.s32 %r160, %r128, %r3;
- setp.lt.s32 %p180, %r160, 1;
+ div.s32 %r159, %r127, %r3;
+ setp.lt.s32 %p180, %r159, 1;
mov.f32 %f684, 0f00000000;
mov.f32 %f685, %f684;
@%p180 bra $L__BB0_280;
- mul.lo.s32 %r537, %r131, %r596;
- add.s32 %r161, %r130, %r537;
- add.s32 %r162, %r132, %r537;
- mov.u32 %r536, 0;
+ mul.lo.s32 %r536, %r130, %r590;
+ add.s32 %r160, %r129, %r536;
+ add.s32 %r161, %r131, %r536;
+ mov.u32 %r535, 0;
mov.f32 %f684, 0f00000000;
- mov.u32 %r605, %r536;
+ mov.u32 %r599, %r535;
$L__BB0_276:
.pragma "nounroll";
- setp.ge.s32 %p181, %r161, %r213;
- mov.u32 %r606, %r536;
- mov.u32 %r607, %r536;
+ setp.ge.s32 %p181, %r160, %r212;
+ mov.u32 %r600, %r535;
+ mov.u32 %r601, %r535;
@%p181 bra $L__BB0_279;
- mad.lo.s32 %r164, %r605, %r3, %r5;
- setp.ge.s32 %p182, %r164, %r11;
- mov.u32 %r606, %r536;
- mov.u32 %r607, %r536;
+ mad.lo.s32 %r163, %r599, %r3, %r5;
+ setp.ge.s32 %p182, %r163, %r11;
+ mov.u32 %r600, %r535;
+ mov.u32 %r601, %r535;
@%p182 bra $L__BB0_279;
- mad.lo.s32 %r544, %r164, %r213, %r162;
- mul.wide.s32 %rd225, %r544, 4;
+ mad.lo.s32 %r543, %r163, %r212, %r161;
+ mul.wide.s32 %rd225, %r543, 4;
add.s64 %rd224, %rd41, %rd225;
- ld.volatile.global.v2.s32 {%r607,%r606}, [%rd224];
+ ld.volatile.global.v2.s32 {%r601,%r600}, [%rd224];
$L__BB0_279:
- mov.b32 %f584, %r607;
+ mov.b32 %f584, %r601;
add.f32 %f685, %f685, %f584;
- mov.b32 %f585, %r606;
+ mov.b32 %f585, %r600;
add.f32 %f684, %f684, %f585;
- add.s32 %r605, %r605, 1;
- setp.lt.s32 %p183, %r605, %r160;
+ add.s32 %r599, %r599, 1;
+ setp.lt.s32 %p183, %r599, %r159;
@%p183 bra $L__BB0_276;
$L__BB0_280:
st.shared.f32 [%rd23], %f685;
bar.sync 0;
@@ -2013,33 +2007,33 @@
ld.shared.f32 %f587, [%rd23];
add.f32 %f588, %f586, %f587;
st.shared.f32 [%rd23], %f588;
$L__BB0_282:
- setp.lt.s32 %p185, %r133, 4;
+ setp.lt.s32 %p185, %r132, 4;
bar.sync 0;
@%p185 bra $L__BB0_287;
- mov.u32 %r608, %r134;
+ mov.u32 %r602, %r133;
$L__BB0_284:
- setp.ge.u32 %p186, %r5, %r608;
+ setp.ge.u32 %p186, %r5, %r602;
@%p186 bra $L__BB0_286;
- add.s32 %r545, %r608, %r50;
- mul.wide.s32 %rd226, %r545, 4;
+ add.s32 %r544, %r602, %r49;
+ mul.wide.s32 %rd226, %r544, 4;
add.s64 %rd228, %rd45, %rd226;
ld.shared.f32 %f589, [%rd23];
ld.shared.f32 %f590, [%rd228];
add.f32 %f591, %f590, %f589;
st.shared.f32 [%rd23], %f591;
$L__BB0_286:
bar.sync 0;
- shr.u32 %r171, %r608, 1;
- setp.gt.u32 %p187, %r608, 3;
- mov.u32 %r608, %r171;
+ shr.u32 %r170, %r602, 1;
+ setp.gt.u32 %p187, %r602, 3;
+ mov.u32 %r602, %r170;
@%p187 bra $L__BB0_284;
$L__BB0_287:
mov.f32 %f686, 0f00000000;
@%p8 bra $L__BB0_290;
@@ -2069,29 +2063,29 @@
$L__BB0_292:
bar.sync 0;
@%p185 bra $L__BB0_297;
- mov.u32 %r609, %r134;
+ mov.u32 %r603, %r133;
$L__BB0_294:
- setp.ge.u32 %p192, %r5, %r609;
+ setp.ge.u32 %p192, %r5, %r603;
@%p192 bra $L__BB0_296;
- add.s32 %r546, %r609, %r50;
- mul.wide.s32 %rd229, %r546, 4;
+ add.s32 %r545, %r603, %r49;
+ mul.wide.s32 %rd229, %r545, 4;
add.s64 %rd231, %rd45, %rd229;
ld.shared.f32 %f599, [%rd23];
ld.shared.f32 %f600, [%rd231];
add.f32 %f601, %f600, %f599;
st.shared.f32 [%rd23], %f601;
$L__BB0_296:
bar.sync 0;
- shr.u32 %r173, %r609, 1;
- setp.gt.u32 %p193, %r609, 3;
- mov.u32 %r609, %r173;
+ shr.u32 %r172, %r603, 1;
+ setp.gt.u32 %p193, %r603, 3;
+ mov.u32 %r603, %r172;
@%p193 bra $L__BB0_294;
$L__BB0_297:
mov.f32 %f687, 0f00000000;
@%p8 bra $L__BB0_300;
@@ -2110,74 +2104,74 @@
{ cvt.rn.bf16.f32 %rs132, %f687;}
@%p8 bra $L__BB0_303;
- mul.lo.s32 %r174, %r131, %r596;
- add.s32 %r547, %r130, %r174;
- setp.ge.s32 %p197, %r547, %r213;
+ mul.lo.s32 %r173, %r130, %r590;
+ add.s32 %r546, %r129, %r173;
+ setp.ge.s32 %p197, %r546, %r212;
@%p197 bra $L__BB0_303;
- add.s32 %r548, %r132, %r174;
- mul.wide.s32 %rd232, %r548, 2;
+ add.s32 %r547, %r131, %r173;
+ mul.wide.s32 %rd232, %r547, 2;
add.s64 %rd233, %rd31, %rd232;
st.global.v2.u16 [%rd233], {%rs131, %rs132};
bra.uni $L__BB0_303;
$L__BB0_241:
- setp.lt.s32 %p160, %r127, 1;
+ setp.lt.s32 %p160, %r126, 1;
@%p160 bra $L__BB0_273;
- div.s32 %r136, %r128, %r3;
- mad.lo.s32 %r137, %r213, %r5, %r129;
- shl.b32 %r138, %r124, 1;
- shl.b32 %r139, %r11, 1;
- mul.lo.s32 %r140, %r213, %r3;
- mov.u32 %r597, 0;
+ div.s32 %r135, %r127, %r3;
+ mad.lo.s32 %r136, %r212, %r5, %r128;
+ shl.b32 %r137, %r123, 1;
+ shl.b32 %r138, %r11, 1;
+ mul.lo.s32 %r139, %r212, %r3;
+ mov.u32 %r591, 0;
$L__BB0_243:
.pragma "nounroll";
- setp.lt.s32 %p161, %r136, 1;
+ setp.lt.s32 %p161, %r135, 1;
mov.f32 %f678, 0f00000000;
mov.f32 %f679, %f678;
@%p161 bra $L__BB0_249;
- mad.lo.s32 %r142, %r131, %r597, %r130;
- mad.lo.s32 %r525, %r139, %r597, %r138;
- mad.lo.s32 %r599, %r4, %r525, %r137;
- mov.u32 %r524, 0;
+ mad.lo.s32 %r141, %r130, %r591, %r129;
+ mad.lo.s32 %r524, %r138, %r591, %r137;
+ mad.lo.s32 %r593, %r4, %r524, %r136;
+ mov.u32 %r523, 0;
mov.f32 %f678, 0f00000000;
- mov.u32 %r598, %r5;
- mov.u32 %r600, %r524;
+ mov.u32 %r592, %r5;
+ mov.u32 %r594, %r523;
$L__BB0_245:
.pragma "nounroll";
- setp.ge.s32 %p162, %r142, %r213;
- mov.u32 %r601, %r524;
- mov.u32 %r602, %r524;
+ setp.ge.s32 %p162, %r141, %r212;
+ mov.u32 %r595, %r523;
+ mov.u32 %r596, %r523;
@%p162 bra $L__BB0_248;
- setp.ge.s32 %p163, %r598, %r11;
- mov.u32 %r601, %r524;
- mov.u32 %r602, %r524;
+ setp.ge.s32 %p163, %r592, %r11;
+ mov.u32 %r595, %r523;
+ mov.u32 %r596, %r523;
@%p163 bra $L__BB0_248;
- mul.wide.s32 %rd215, %r599, 4;
+ mul.wide.s32 %rd215, %r593, 4;
add.s64 %rd214, %rd42, %rd215;
- ld.volatile.global.v2.s32 {%r602,%r601}, [%rd214];
+ ld.volatile.global.v2.s32 {%r596,%r595}, [%rd214];
$L__BB0_248:
- mov.b32 %f558, %r602;
+ mov.b32 %f558, %r596;
add.f32 %f679, %f679, %f558;
- mov.b32 %f559, %r601;
+ mov.b32 %f559, %r595;
add.f32 %f678, %f678, %f559;
- add.s32 %r599, %r599, %r140;
- add.s32 %r598, %r598, %r3;
- add.s32 %r600, %r600, 1;
- setp.lt.s32 %p164, %r600, %r136;
+ add.s32 %r593, %r593, %r139;
+ add.s32 %r592, %r592, %r3;
+ add.s32 %r594, %r594, 1;
+ setp.lt.s32 %p164, %r594, %r135;
@%p164 bra $L__BB0_245;
$L__BB0_249:
st.shared.f32 [%rd23], %f679;
bar.sync 0;
@@ -2187,33 +2181,33 @@
ld.shared.f32 %f561, [%rd23];
add.f32 %f562, %f560, %f561;
st.shared.f32 [%rd23], %f562;
$L__BB0_251:
- setp.lt.s32 %p166, %r133, 4;
+ setp.lt.s32 %p166, %r132, 4;
bar.sync 0;
@%p166 bra $L__BB0_256;
- mov.u32 %r603, %r134;
+ mov.u32 %r597, %r133;
$L__BB0_253:
- setp.ge.u32 %p167, %r5, %r603;
+ setp.ge.u32 %p167, %r5, %r597;
@%p167 bra $L__BB0_255;
- add.s32 %r532, %r603, %r50;
- mul.wide.s32 %rd216, %r532, 4;
+ add.s32 %r531, %r597, %r49;
+ mul.wide.s32 %rd216, %r531, 4;
add.s64 %rd218, %rd45, %rd216;
ld.shared.f32 %f563, [%rd23];
ld.shared.f32 %f564, [%rd218];
add.f32 %f565, %f564, %f563;
st.shared.f32 [%rd23], %f565;
$L__BB0_255:
bar.sync 0;
- shr.u32 %r155, %r603, 1;
- setp.gt.u32 %p168, %r603, 3;
- mov.u32 %r603, %r155;
+ shr.u32 %r154, %r597, 1;
+ setp.gt.u32 %p168, %r597, 3;
+ mov.u32 %r597, %r154;
@%p168 bra $L__BB0_253;
$L__BB0_256:
mov.f32 %f680, 0f00000000;
@%p8 bra $L__BB0_259;
@@ -2243,29 +2237,29 @@
$L__BB0_261:
bar.sync 0;
@%p166 bra $L__BB0_266;
- mov.u32 %r604, %r134;
+ mov.u32 %r598, %r133;
$L__BB0_263:
- setp.ge.u32 %p173, %r5, %r604;
+ setp.ge.u32 %p173, %r5, %r598;
@%p173 bra $L__BB0_265;
- add.s32 %r533, %r604, %r50;
- mul.wide.s32 %rd219, %r533, 4;
+ add.s32 %r532, %r598, %r49;
+ mul.wide.s32 %rd219, %r532, 4;
add.s64 %rd221, %rd45, %rd219;
ld.shared.f32 %f573, [%rd23];
ld.shared.f32 %f574, [%rd221];
add.f32 %f575, %f574, %f573;
st.shared.f32 [%rd23], %f575;
$L__BB0_265:
bar.sync 0;
- shr.u32 %r157, %r604, 1;
- setp.gt.u32 %p174, %r604, 3;
- mov.u32 %r604, %r157;
+ shr.u32 %r156, %r598, 1;
+ setp.gt.u32 %p174, %r598, 3;
+ mov.u32 %r598, %r156;
@%p174 bra $L__BB0_263;
$L__BB0_266:
mov.f32 %f681, 0f00000000;
@%p8 bra $L__BB0_269;
@@ -2284,23 +2278,23 @@
{ cvt.rn.bf16.f32 %rs130, %f681;}
@%p8 bra $L__BB0_272;
- mul.lo.s32 %r158, %r131, %r597;
- add.s32 %r534, %r130, %r158;
- setp.ge.s32 %p178, %r534, %r213;
+ mul.lo.s32 %r157, %r130, %r591;
+ add.s32 %r533, %r129, %r157;
+ setp.ge.s32 %p178, %r533, %r212;
@%p178 bra $L__BB0_272;
- add.s32 %r535, %r132, %r158;
- mul.wide.s32 %rd222, %r535, 2;
+ add.s32 %r534, %r131, %r157;
+ mul.wide.s32 %rd222, %r534, 2;
add.s64 %rd223, %rd32, %rd222;
st.global.v2.u16 [%rd223], {%rs129, %rs130};
$L__BB0_272:
- add.s32 %r597, %r597, 1;
- setp.lt.s32 %p179, %r597, %r127;
+ add.s32 %r591, %r591, 1;
+ setp.lt.s32 %p179, %r591, %r126;
@%p179 bra $L__BB0_243;
$L__BB0_273:
ret;
22: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_768
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 60→ 56
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<779>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r202, %r203}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r216, %r217}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r238, %r203, 7;
shr.s32 %r239, %r238, 31;
shr.u32 %r240, %r239, 29;
add.s32 %r241, %r238, %r240;
shr.s32 %r2, %r241, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r242, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r243, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r244, %r4, %r2;
shl.b32 %r245, %r244, 4;
or.b32 %r246, %r245, 15;
and.b32 %r7, %r246, -16;
add.s32 %r247, %r246, %r7;
and.b32 %r248, %r247, -16;
cvt.s64.s32 %rd1, %r248;
max.s32 %r249, %r2, %r3;
add.s32 %r250, %r249, 31;
shr.s32 %r251, %r250, 31;
shr.u32 %r252, %r251, 27;
add.s32 %r253, %r250, %r252;
shr.u32 %r254, %r253, 5;
mul.lo.s32 %r255, %r4, %r254;
shl.b32 %r256, %r255, 7;
cvt.u64.u32 %rd2, %r256;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r257, %r8, 7;
setp.lt.s32 %p7, %r257, %r203;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
// end inline asm
shl.b32 %r261, %r5, 4;
add.s32 %r259, %r258, %r261;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r260, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r260, 0;
cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r735, %r6, 4;
add.s32 %r262, %r4, 215;
div.s32 %r263, %r262, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r264, %r11, %r263;
add.s32 %r265, %r264, -1;
div.s32 %r12, %r265, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r203;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r267, %ctaid.y;
mul.lo.s32 %r268, %r12, %r4;
mul.lo.s32 %r13, %r268, %r267;
shl.b32 %r269, %r9, 1;
shl.b32 %r270, %r5, 4;
mad.lo.s32 %r14, %r269, %r203, %r270;
mul.lo.s32 %r271, %r203, %r9;
cvt.s64.s32 %rd53, %r271;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r272, %r13, %r203;
cvt.s64.s32 %rd6, %r272;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
add.s32 %r15, %r271, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r275, %r274, %r16;
shr.u32 %r17, %r5, 5;
add.s32 %r276, %r275, %r17;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r18, %r5, 31;
add.s32 %r277, %r275, %r18;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r734, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r14, %r280;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r14, %r283;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r734, %r4;
add.s32 %r278, %r23, %r9;
add.s32 %r24, %r278, %r13;
setp.gt.s32 %p13, %r24, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r24, %r212;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r24, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r23, %r203;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r24, %r216;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ mov.b32 %f234, {0,%rs36};}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ mov.b32 %f235, {0,%rs37};}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ mov.b32 %f236, {0,%rs38};}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ mov.b32 %f237, {0,%rs39};}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ mov.b32 %f238, {0,%rs40};}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ mov.b32 %f239, {0,%rs41};}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ mov.b32 %f240, {0,%rs42};}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ mov.b32 %f241, {0,%rs43};}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ mov.b32 %f242, {0,%rs44};}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ mov.b32 %f243, {0,%rs45};}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ mov.b32 %f244, {0,%rs46};}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ mov.b32 %f245, {0,%rs47};}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ mov.b32 %f246, {0,%rs48};}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ mov.b32 %f247, {0,%rs49};}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ mov.b32 %f248, {0,%rs50};}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ mov.b32 %f249, {0,%rs51};}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ mov.b32 %f250, {0,%rs52};}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ mov.b32 %f251, {0,%rs53};}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ mov.b32 %f252, {0,%rs54};}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ mov.b32 %f253, {0,%rs55};}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ mov.b32 %f254, {0,%rs56};}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ mov.b32 %f255, {0,%rs57};}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ mov.b32 %f256, {0,%rs58};}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ mov.b32 %f257, {0,%rs59};}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r735, %r735, 2;
bar.sync 0;
setp.ne.s32 %p23, %r18, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r17, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r18, %r16;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r18, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r18, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r17, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r18, %r16;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r18, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ mov.b32 %f374, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ mov.b32 %f375, {0,%rs98};}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ mov.b32 %f376, {0,%rs99};}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ mov.b32 %f378, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f379, {0,%rs102};}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ mov.b32 %f380, {0,%rs103};}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ mov.b32 %f382, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ mov.b32 %f383, {0,%rs106};}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ mov.b32 %f384, {0,%rs107};}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ mov.b32 %f386, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f387, {0,%rs110};}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ mov.b32 %f388, {0,%rs111};}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ mov.b32 %f390, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ mov.b32 %f391, {0,%rs114};}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ mov.b32 %f392, {0,%rs115};}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ mov.b32 %f394, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f395, {0,%rs118};}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ mov.b32 %f396, {0,%rs119};}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ mov.b32 %f398, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ mov.b32 %f399, {0,%rs122};}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ mov.b32 %f400, {0,%rs123};}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ mov.b32 %f402, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f403, {0,%rs126};}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ mov.b32 %f404, {0,%rs127};}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
add.s32 %r416, %r13, %r732;
mad.lo.s32 %r417, %r416, %r203, %r15;
mul.wide.s32 %rd76, %r417, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r734, %r734, 1;
setp.lt.s32 %p49, %r734, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r418, %tid.z;
mad.lo.s32 %r46, %r418, %r4, %r9;
mad.lo.s32 %r47, %r46, %r3, %r5;
mul.wide.u32 %rd77, %r47, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r419, %r4;
mov.u32 %r420, 31;
sub.s32 %r48, %r420, %r419;
mov.u32 %r421, 1;
shl.b32 %r766, %r421, %r48;
setp.lt.u32 %p50, %r9, %r766;
add.s32 %r422, %r766, %r9;
setp.lt.u32 %p51, %r422, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r423, %r3, %r48;
add.s32 %r424, %r47, %r423;
mul.wide.s32 %rd79, %r424, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r766, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r736, %r766;
$L__BB0_40:
shr.u32 %r51, %r736, 1;
setp.ge.u32 %p54, %r9, %r51;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r425, %r51, %r3, %r47;
mul.wide.s32 %rd82, %r425, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r736, 7;
mov.u32 %r736, %r51;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r737, 0;
add.s32 %r427, %r47, %r3;
mul.wide.u32 %rd85, %r427, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r737, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r428, %r3, %r48;
add.s32 %r429, %r47, %r428;
mul.wide.s32 %rd87, %r429, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r738, %r766;
$L__BB0_51:
shr.u32 %r55, %r738, 1;
setp.ge.u32 %p60, %r9, %r55;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r430, %r55, %r3, %r47;
mul.wide.s32 %rd90, %r430, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r738, 7;
mov.u32 %r738, %r55;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r739, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r432, %r3, %r48;
add.s32 %r433, %r47, %r432;
mul.wide.s32 %rd93, %r433, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r740, %r766;
$L__BB0_62:
shr.u32 %r59, %r740, 1;
setp.ge.u32 %p66, %r9, %r59;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r434, %r59, %r3, %r47;
mul.wide.s32 %rd96, %r434, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r740, 7;
mov.u32 %r740, %r59;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r741, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r436, %r3, %r48;
add.s32 %r437, %r47, %r436;
mul.wide.s32 %rd99, %r437, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r742, %r766;
$L__BB0_73:
shr.u32 %r63, %r742, 1;
setp.ge.u32 %p72, %r9, %r63;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r438, %r63, %r3, %r47;
mul.wide.s32 %rd102, %r438, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r742, 7;
mov.u32 %r742, %r63;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r743, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r440, %r3, %r48;
add.s32 %r441, %r47, %r440;
mul.wide.s32 %rd105, %r441, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r744, %r766;
$L__BB0_84:
shr.u32 %r67, %r744, 1;
setp.ge.u32 %p78, %r9, %r67;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r442, %r67, %r3, %r47;
mul.wide.s32 %rd108, %r442, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r744, 7;
mov.u32 %r744, %r67;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r745, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r444, %r3, %r48;
add.s32 %r445, %r47, %r444;
mul.wide.s32 %rd111, %r445, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r746, %r766;
$L__BB0_95:
shr.u32 %r71, %r746, 1;
setp.ge.u32 %p84, %r9, %r71;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r446, %r71, %r3, %r47;
mul.wide.s32 %rd114, %r446, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r746, 7;
mov.u32 %r746, %r71;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r747, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r448, %r3, %r48;
add.s32 %r449, %r47, %r448;
mul.wide.s32 %rd117, %r449, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r748, %r766;
$L__BB0_106:
shr.u32 %r75, %r748, 1;
setp.ge.u32 %p90, %r9, %r75;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r450, %r75, %r3, %r47;
mul.wide.s32 %rd120, %r450, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r748, 7;
mov.u32 %r748, %r75;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r749, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r452, %r3, %r48;
add.s32 %r453, %r47, %r452;
mul.wide.s32 %rd123, %r453, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r750, %r766;
$L__BB0_117:
shr.u32 %r79, %r750, 1;
setp.ge.u32 %p96, %r9, %r79;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r454, %r79, %r3, %r47;
mul.wide.s32 %rd126, %r454, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r750, 7;
mov.u32 %r750, %r79;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r751, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r82, %r735, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r456, %r3, %r48;
add.s32 %r457, %r47, %r456;
mul.wide.s32 %rd129, %r457, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r752, %r766;
$L__BB0_128:
shr.u32 %r84, %r752, 1;
setp.ge.u32 %p102, %r9, %r84;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r458, %r84, %r3, %r47;
mul.wide.s32 %rd132, %r458, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r752, 7;
mov.u32 %r752, %r84;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r753, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r460, %r3, %r48;
add.s32 %r461, %r47, %r460;
mul.wide.s32 %rd135, %r461, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r754, %r766;
$L__BB0_139:
shr.u32 %r88, %r754, 1;
setp.ge.u32 %p108, %r9, %r88;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r462, %r88, %r3, %r47;
mul.wide.s32 %rd138, %r462, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r754, 7;
mov.u32 %r754, %r88;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r755, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r464, %r3, %r48;
add.s32 %r465, %r47, %r464;
mul.wide.s32 %rd141, %r465, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r756, %r766;
$L__BB0_150:
shr.u32 %r92, %r756, 1;
setp.ge.u32 %p114, %r9, %r92;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r466, %r92, %r3, %r47;
mul.wide.s32 %rd144, %r466, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r756, 7;
mov.u32 %r756, %r92;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r757, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r468, %r3, %r48;
add.s32 %r469, %r47, %r468;
mul.wide.s32 %rd147, %r469, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r758, %r766;
$L__BB0_161:
shr.u32 %r96, %r758, 1;
setp.ge.u32 %p120, %r9, %r96;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r470, %r96, %r3, %r47;
mul.wide.s32 %rd150, %r470, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r758, 7;
mov.u32 %r758, %r96;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r759, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r472, %r3, %r48;
add.s32 %r473, %r47, %r472;
mul.wide.s32 %rd153, %r473, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r760, %r766;
$L__BB0_172:
shr.u32 %r100, %r760, 1;
setp.ge.u32 %p126, %r9, %r100;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r474, %r100, %r3, %r47;
mul.wide.s32 %rd156, %r474, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r760, 7;
mov.u32 %r760, %r100;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r761, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r476, %r3, %r48;
add.s32 %r477, %r47, %r476;
mul.wide.s32 %rd159, %r477, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r762, %r766;
$L__BB0_183:
shr.u32 %r104, %r762, 1;
setp.ge.u32 %p132, %r9, %r104;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r478, %r104, %r3, %r47;
mul.wide.s32 %rd162, %r478, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r762, 7;
mov.u32 %r762, %r104;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r763, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r763, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r480, %r3, %r48;
add.s32 %r481, %r47, %r480;
mul.wide.s32 %rd165, %r481, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r764, %r766;
$L__BB0_194:
shr.u32 %r108, %r764, 1;
setp.ge.u32 %p138, %r9, %r108;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r482, %r108, %r3, %r47;
mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r764, 7;
mov.u32 %r764, %r108;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r765, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r765, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r484, %r3, %r48;
add.s32 %r485, %r47, %r484;
mul.wide.s32 %rd171, %r485, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r112, %r766, 1;
setp.ge.u32 %p144, %r9, %r112;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r486, %r112, %r3, %r47;
mul.wide.s32 %rd174, %r486, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r766, 7;
mov.u32 %r766, %r112;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r767, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r767, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
shl.b32 %r731, %r5, 3;
mov.u32 %r512, %ctaid.y;
mad.lo.s32 %r513, %r203, %r512, %r731;
add.s32 %r514, %r513, %r82;
mul.wide.s32 %rd183, %r514, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
// end inline asm
add.s32 %r515, %r514, 4;
mul.wide.s32 %rd184, %r515, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r488, %r728, 3;
sub.s32 %r115, %r488, %r203;
mov.u32 %r489, %ctaid.y;
mad.lo.s32 %r116, %r203, %r489, %r728;
neg.s32 %r490, %r82;
setp.ge.s32 %p151, %r115, %r490;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r495, %r116, %r82;
mul.wide.s32 %rd178, %r495, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
// end inline asm
$L__BB0_214:
mov.u32 %r496, -4;
sub.s32 %r497, %r496, %r82;
setp.ge.s32 %p153, %r115, %r497;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r502, %r116, %r82;
add.s32 %r503, %r502, 4;
mul.wide.s32 %rd180, %r503, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r730, %r5, 3;
shl.b32 %r540, %r735, 5;
mov.u32 %r541, %ctaid.y;
mad.lo.s32 %r542, %r203, %r541, %r730;
add.s32 %r543, %r542, %r540;
mul.wide.s32 %rd191, %r543, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
// end inline asm
add.s32 %r544, %r543, 4;
mul.wide.s32 %rd192, %r544, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r516, %r729, 3;
sub.s32 %r117, %r516, %r203;
mov.u32 %r517, %ctaid.y;
mad.lo.s32 %r118, %r203, %r517, %r729;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r119, %r735, 5;
neg.s32 %r518, %r119;
setp.ge.s32 %p160, %r117, %r518;
@%p160 bra $L__BB0_222;
add.s32 %r523, %r118, %r119;
mul.wide.s32 %rd186, %r523, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r120, %r735, 5;
mov.u32 %r524, -4;
sub.s32 %r525, %r524, %r120;
setp.ge.s32 %p162, %r117, %r525;
@%p162 bra $L__BB0_227;
add.s32 %r530, %r118, %r120;
add.s32 %r531, %r530, 4;
mul.wide.s32 %rd188, %r531, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
// end inline asm
$L__BB0_227:
mov.u32 %r121, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r545, %r5, %r9;
or.b32 %r547, %r545, %r418;
setp.ne.s32 %p164, %r547, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r548, %ctaid.x;
mov.u32 %r549, %ctaid.z;
mov.u32 %r550, %nctaid.x;
mad.lo.s32 %r551, %r549, %r550, %r548;
mul.wide.s32 %rd194, %r551, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r552, %r11, -1;
setp.eq.s32 %p165, %r121, %r552;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r768, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r768;
// end inline asm
setp.lt.u32 %p167, %r768, 256;
selp.u32 %r555, 1, 0, %p167;
shl.b32 %r768, %r768, %r555;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_e501cd20_1033910nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r557, %r203, 1;
shr.u32 %r558, %r557, 31;
add.s32 %r559, %r557, %r558;
shr.s32 %r560, %r559, 1;
add.s32 %r561, %r4, %r560;
add.s32 %r562, %r561, -1;
div.s32 %r563, %r562, %r4;
add.s32 %r564, %r11, -1;
add.s32 %r565, %r564, %r563;
div.s32 %r124, %r565, %r11;
add.s32 %r125, %r564, %r3;
shl.b32 %r126, %r9, 1;
shl.b32 %r566, %r4, 1;
mad.lo.s32 %r129, %r566, %r121, %r126;
or.b32 %r127, %r129, 1;
mul.lo.s32 %r128, %r566, %r11;
shr.u32 %r130, %r3, 5;
mul.lo.s32 %r567, %r46, %r130;
shr.u32 %r131, %r5, 5;
add.s32 %r568, %r567, %r131;
mul.wide.u32 %rd203, %r568, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r132, %r5, 31;
add.s32 %r569, %r567, %r132;
mul.wide.u32 %rd205, %r569, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r769, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r769, %r769, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r769, %r124;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r154, %r125, %r3;
setp.lt.s32 %p206, %r154, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r650, %r128, %r769;
add.s32 %r155, %r127, %r650;
add.s32 %r156, %r129, %r650;
mov.u32 %r649, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r776, %r649;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r155, %r203;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r158, %r776, %r3, %r5;
setp.ge.s32 %p208, %r158, %r11;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r657, %r158, %r203, %r156;
mul.wide.s32 %rd211, %r657, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r778;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r777;
add.f32 %f769, %f769, %f643;
add.s32 %r776, %r776, 1;
setp.lt.s32 %p209, %r776, %r154;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r658, %f770;
mov.u32 %r659, 31;
mov.u32 %r660, 16;
mov.u32 %r661, -1;
shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
mov.b32 %f644, %r662;
add.f32 %f645, %f770, %f644;
mov.b32 %r663, %f645;
mov.u32 %r664, 8;
shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
mov.b32 %f646, %r665;
add.f32 %f647, %f645, %f646;
mov.b32 %r666, %f647;
mov.u32 %r667, 4;
shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
mov.b32 %f648, %r668;
add.f32 %f649, %f647, %f648;
mov.b32 %r669, %f649;
mov.u32 %r670, 2;
shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
mov.b32 %f650, %r671;
add.f32 %f651, %f649, %f650;
mov.b32 %r672, %f651;
mov.u32 %r673, 1;
shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
mov.b32 %f652, %r674;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r132, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r131, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r132, %r130;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r675, %f771;
mov.u32 %r676, 31;
mov.u32 %r677, 16;
mov.u32 %r678, -1;
shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
mov.b32 %f654, %r679;
add.f32 %f655, %f771, %f654;
mov.b32 %r680, %f655;
mov.u32 %r681, 8;
shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
mov.b32 %f656, %r682;
add.f32 %f657, %f655, %f656;
mov.b32 %r683, %f657;
mov.u32 %r684, 4;
shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
mov.b32 %f658, %r685;
add.f32 %f659, %f657, %f658;
mov.b32 %r686, %f659;
mov.u32 %r687, 2;
shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
mov.b32 %f660, %r688;
add.f32 %f661, %f659, %f660;
mov.b32 %r689, %f661;
mov.u32 %r690, 1;
shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
mov.b32 %f662, %r691;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r132, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r692, %f769;
mov.u32 %r693, 31;
mov.u32 %r694, 16;
mov.u32 %r695, -1;
shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
mov.b32 %f665, %r696;
add.f32 %f666, %f769, %f665;
mov.b32 %r697, %f666;
mov.u32 %r698, 8;
shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
mov.b32 %f667, %r699;
add.f32 %f668, %f666, %f667;
mov.b32 %r700, %f668;
mov.u32 %r701, 4;
shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
mov.b32 %f669, %r702;
add.f32 %f670, %f668, %f669;
mov.b32 %r703, %f670;
mov.u32 %r704, 2;
shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
mov.b32 %f671, %r705;
add.f32 %f672, %f670, %f671;
mov.b32 %r706, %f672;
mov.u32 %r707, 1;
shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
mov.b32 %f673, %r708;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r132, %r130;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r709, %f773;
mov.u32 %r710, 31;
mov.u32 %r711, 16;
mov.u32 %r712, -1;
shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
mov.b32 %f675, %r713;
add.f32 %f676, %f773, %f675;
mov.b32 %r714, %f676;
mov.u32 %r715, 8;
shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
mov.b32 %f677, %r716;
add.f32 %f678, %f676, %f677;
mov.b32 %r717, %f678;
mov.u32 %r718, 4;
shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
mov.b32 %f679, %r719;
add.f32 %f680, %f678, %f679;
mov.b32 %r720, %f680;
mov.u32 %r721, 2;
shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
mov.b32 %f681, %r722;
add.f32 %f682, %f680, %f681;
mov.b32 %r723, %f682;
mov.u32 %r724, 1;
shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
mov.b32 %f683, %r725;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r164, %r128, %r769;
add.s32 %r726, %r127, %r164;
setp.ge.s32 %p239, %r726, %r203;
@%p239 bra $L__BB0_279;
add.s32 %r727, %r129, %r164;
mul.wide.s32 %rd212, %r727, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r124, 1;
@%p170 bra $L__BB0_257;
div.s32 %r134, %r125, %r3;
mad.lo.s32 %r135, %r203, %r5, %r126;
shl.b32 %r136, %r121, 1;
shl.b32 %r137, %r11, 1;
mul.lo.s32 %r138, %r203, %r3;
mov.u32 %r770, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r134, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r140, %r128, %r770, %r127;
mad.lo.s32 %r572, %r137, %r770, %r136;
mad.lo.s32 %r772, %r4, %r572, %r135;
mov.u32 %r571, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r771, %r5;
mov.u32 %r773, %r571;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r140, %r203;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r771, %r11;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r772, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r775;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r774;
add.f32 %f761, %f761, %f595;
add.s32 %r772, %r772, %r138;
add.s32 %r771, %r771, %r3;
add.s32 %r773, %r773, 1;
setp.lt.s32 %p174, %r773, %r134;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r579, %f762;
mov.u32 %r580, 31;
mov.u32 %r581, 16;
mov.u32 %r582, -1;
shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
mov.b32 %f596, %r583;
add.f32 %f597, %f762, %f596;
mov.b32 %r584, %f597;
mov.u32 %r585, 8;
shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
mov.b32 %f598, %r586;
add.f32 %f599, %f597, %f598;
mov.b32 %r587, %f599;
mov.u32 %r588, 4;
shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
mov.b32 %f600, %r589;
add.f32 %f601, %f599, %f600;
mov.b32 %r590, %f601;
mov.u32 %r591, 2;
shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
mov.b32 %f602, %r592;
add.f32 %f603, %f601, %f602;
mov.b32 %r593, %f603;
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
mov.b32 %f604, %r595;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r132, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r131, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r132, %r130;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r596, %f763;
mov.u32 %r597, 31;
mov.u32 %r598, 16;
mov.u32 %r599, -1;
shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
mov.b32 %f606, %r600;
add.f32 %f607, %f763, %f606;
mov.b32 %r601, %f607;
mov.u32 %r602, 8;
shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
mov.b32 %f608, %r603;
add.f32 %f609, %f607, %f608;
mov.b32 %r604, %f609;
mov.u32 %r605, 4;
shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
mov.b32 %f610, %r606;
add.f32 %f611, %f609, %f610;
mov.b32 %r607, %f611;
mov.u32 %r608, 2;
shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
mov.b32 %f612, %r609;
add.f32 %f613, %f611, %f612;
mov.b32 %r610, %f613;
mov.u32 %r611, 1;
shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
mov.b32 %f614, %r612;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r132, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r613, %f761;
mov.u32 %r614, 31;
mov.u32 %r615, 16;
mov.u32 %r616, -1;
shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
mov.b32 %f617, %r617;
add.f32 %f618, %f761, %f617;
mov.b32 %r618, %f618;
mov.u32 %r619, 8;
shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
mov.b32 %f619, %r620;
add.f32 %f620, %f618, %f619;
mov.b32 %r621, %f620;
mov.u32 %r622, 4;
shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
mov.b32 %f621, %r623;
add.f32 %f622, %f620, %f621;
mov.b32 %r624, %f622;
mov.u32 %r625, 2;
shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
mov.b32 %f623, %r626;
add.f32 %f624, %f622, %f623;
mov.b32 %r627, %f624;
mov.u32 %r628, 1;
shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
mov.b32 %f625, %r629;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r132, %r130;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r630, %f765;
mov.u32 %r631, 31;
mov.u32 %r632, 16;
mov.u32 %r633, -1;
shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
mov.b32 %f627, %r634;
add.f32 %f628, %f765, %f627;
mov.b32 %r635, %f628;
mov.u32 %r636, 8;
shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
mov.b32 %f629, %r637;
add.f32 %f630, %f628, %f629;
mov.b32 %r638, %f630;
mov.u32 %r639, 4;
shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
mov.b32 %f631, %r640;
add.f32 %f632, %f630, %f631;
mov.b32 %r641, %f632;
mov.u32 %r642, 2;
shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
mov.b32 %f633, %r643;
add.f32 %f634, %f632, %f633;
mov.b32 %r644, %f634;
mov.u32 %r645, 1;
shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
mov.b32 %f635, %r646;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r152, %r128, %r770;
add.s32 %r647, %r127, %r152;
setp.ge.s32 %p204, %r647, %r203;
@%p204 bra $L__BB0_256;
add.s32 %r648, %r129, %r152;
mul.wide.s32 %rd208, %r648, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r770, %r770, 1;
setp.lt.s32 %p205, %r770, %r124;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
max.s32 %r248, %r2, %r3;
add.s32 %r249, %r248, 31;
shr.s32 %r250, %r249, 31;
shr.u32 %r251, %r250, 27;
add.s32 %r252, %r249, %r251;
shr.u32 %r253, %r252, 5;
mul.lo.s32 %r254, %r4, %r253;
shl.b32 %r255, %r254, 7;
cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r256, %r8, 7;
setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
shl.b32 %r260, %r5, 4;
add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r729, %r6, 4;
add.s32 %r261, %r4, 215;
div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r263, %r11, %r262;
add.s32 %r264, %r263, -1;
div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r266, %ctaid.y;
mul.lo.s32 %r267, %r12, %r4;
mul.lo.s32 %r13, %r267, %r266;
mad.lo.s32 %r268, %r2, %r9, %r5;
shl.b32 %r14, %r268, 4;
mul.lo.s32 %r269, %r202, %r9;
cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r270, %r13, %r202;
cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
shl.b32 %r271, %r9, 3;
mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r275, %r274, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r17, %r5, 31;
add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r280, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r728, %r4;
add.s32 %r278, %r22, %r9;
add.s32 %r23, %r278, %r13;
setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ mov.b32 %f234, {0,%rs36};}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ mov.b32 %f235, {0,%rs37};}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ mov.b32 %f236, {0,%rs38};}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ mov.b32 %f237, {0,%rs39};}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ mov.b32 %f238, {0,%rs40};}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ mov.b32 %f239, {0,%rs41};}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ mov.b32 %f240, {0,%rs42};}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ mov.b32 %f241, {0,%rs43};}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ mov.b32 %f242, {0,%rs44};}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ mov.b32 %f243, {0,%rs45};}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ mov.b32 %f244, {0,%rs46};}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ mov.b32 %f245, {0,%rs47};}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ mov.b32 %f246, {0,%rs48};}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ mov.b32 %f247, {0,%rs49};}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ mov.b32 %f248, {0,%rs50};}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ mov.b32 %f249, {0,%rs51};}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ mov.b32 %f250, {0,%rs52};}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ mov.b32 %f251, {0,%rs53};}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ mov.b32 %f252, {0,%rs54};}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ mov.b32 %f253, {0,%rs55};}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ mov.b32 %f254, {0,%rs56};}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ mov.b32 %f255, {0,%rs57};}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ mov.b32 %f256, {0,%rs58};}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ mov.b32 %f257, {0,%rs59};}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r729, %r729, 2;
bar.sync 0;
setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ mov.b32 %f374, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ mov.b32 %f375, {0,%rs98};}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ mov.b32 %f376, {0,%rs99};}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ mov.b32 %f378, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f379, {0,%rs102};}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ mov.b32 %f380, {0,%rs103};}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ mov.b32 %f382, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ mov.b32 %f383, {0,%rs106};}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ mov.b32 %f384, {0,%rs107};}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ mov.b32 %f386, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f387, {0,%rs110};}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ mov.b32 %f388, {0,%rs111};}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ mov.b32 %f390, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ mov.b32 %f391, {0,%rs114};}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ mov.b32 %f392, {0,%rs115};}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ mov.b32 %f394, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f395, {0,%rs118};}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ mov.b32 %f396, {0,%rs119};}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ mov.b32 %f398, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ mov.b32 %f399, {0,%rs122};}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ mov.b32 %f400, {0,%rs123};}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ mov.b32 %f402, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f403, {0,%rs126};}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ mov.b32 %f404, {0,%rs127};}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
mad.lo.s32 %r416, %r23, %r202, %r8;
mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r728, %r728, 1;
setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r417, %tid.z;
mad.lo.s32 %r45, %r417, %r4, %r9;
mad.lo.s32 %r46, %r45, %r3, %r5;
mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r418, %r4;
mov.u32 %r419, 31;
sub.s32 %r47, %r419, %r418;
mov.u32 %r420, 1;
shl.b32 %r760, %r420, %r47;
setp.lt.u32 %p50, %r9, %r760;
add.s32 %r421, %r760, %r9;
setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r422, %r3, %r47;
add.s32 %r423, %r46, %r422;
mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r730, %r760;
$L__BB0_40:
shr.u32 %r50, %r730, 1;
setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r424, %r50, %r3, %r46;
mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r730, 7;
mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r731, 0;
add.s32 %r426, %r46, %r3;
mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r427, %r3, %r47;
add.s32 %r428, %r46, %r427;
mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r732, %r760;
$L__BB0_51:
shr.u32 %r54, %r732, 1;
setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r429, %r54, %r3, %r46;
mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r732, 7;
mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r431, %r3, %r47;
add.s32 %r432, %r46, %r431;
mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r734, %r760;
$L__BB0_62:
shr.u32 %r58, %r734, 1;
setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r433, %r58, %r3, %r46;
mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r734, 7;
mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r435, %r3, %r47;
add.s32 %r436, %r46, %r435;
mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r736, %r760;
$L__BB0_73:
shr.u32 %r62, %r736, 1;
setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r437, %r62, %r3, %r46;
mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r736, 7;
mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r439, %r3, %r47;
add.s32 %r440, %r46, %r439;
mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r738, %r760;
$L__BB0_84:
shr.u32 %r66, %r738, 1;
setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r441, %r66, %r3, %r46;
mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r738, 7;
mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r443, %r3, %r47;
add.s32 %r444, %r46, %r443;
mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r740, %r760;
$L__BB0_95:
shr.u32 %r70, %r740, 1;
setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r445, %r70, %r3, %r46;
mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r740, 7;
mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r447, %r3, %r47;
add.s32 %r448, %r46, %r447;
mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r742, %r760;
$L__BB0_106:
shr.u32 %r74, %r742, 1;
setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r449, %r74, %r3, %r46;
mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r742, 7;
mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r451, %r3, %r47;
add.s32 %r452, %r46, %r451;
mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r744, %r760;
$L__BB0_117:
shr.u32 %r78, %r744, 1;
setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r453, %r78, %r3, %r46;
mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r744, 7;
mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r455, %r3, %r47;
add.s32 %r456, %r46, %r455;
mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r746, %r760;
$L__BB0_128:
shr.u32 %r83, %r746, 1;
setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r457, %r83, %r3, %r46;
mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r746, 7;
mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r459, %r3, %r47;
add.s32 %r460, %r46, %r459;
mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r748, %r760;
$L__BB0_139:
shr.u32 %r87, %r748, 1;
setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r461, %r87, %r3, %r46;
mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r748, 7;
mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r463, %r3, %r47;
add.s32 %r464, %r46, %r463;
mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r750, %r760;
$L__BB0_150:
shr.u32 %r91, %r750, 1;
setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r465, %r91, %r3, %r46;
mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r750, 7;
mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r467, %r3, %r47;
add.s32 %r468, %r46, %r467;
mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r752, %r760;
$L__BB0_161:
shr.u32 %r95, %r752, 1;
setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r469, %r95, %r3, %r46;
mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r752, 7;
mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r471, %r3, %r47;
add.s32 %r472, %r46, %r471;
mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r754, %r760;
$L__BB0_172:
shr.u32 %r99, %r754, 1;
setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r473, %r99, %r3, %r46;
mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r754, 7;
mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r475, %r3, %r47;
add.s32 %r476, %r46, %r475;
mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r756, %r760;
$L__BB0_183:
shr.u32 %r103, %r756, 1;
setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r477, %r103, %r3, %r46;
mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r756, 7;
mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r479, %r3, %r47;
add.s32 %r480, %r46, %r479;
mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r758, %r760;
$L__BB0_194:
shr.u32 %r107, %r758, 1;
setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r481, %r107, %r3, %r46;
mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r758, 7;
mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r483, %r3, %r47;
add.s32 %r484, %r46, %r483;
mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r111, %r760, 1;
setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r485, %r111, %r3, %r46;
mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r760, 7;
mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
mov.u32 %r511, %ctaid.y;
mad.lo.s32 %r512, %r202, %r511, %r8;
add.s32 %r513, %r512, %r81;
mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
// end inline asm
add.s32 %r514, %r513, 4;
mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r487, %r8, 3;
sub.s32 %r114, %r487, %r202;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r115, %r202, %r488, %r8;
neg.s32 %r489, %r81;
setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r494, %r115, %r81;
mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
// end inline asm
$L__BB0_214:
mov.u32 %r495, -4;
sub.s32 %r496, %r495, %r81;
setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r501, %r115, %r81;
add.s32 %r502, %r501, 4;
mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r539, %r729, 5;
mov.u32 %r540, %ctaid.y;
mad.lo.s32 %r541, %r202, %r540, %r8;
add.s32 %r542, %r541, %r539;
mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
// end inline asm
add.s32 %r543, %r542, 4;
mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r515, %r8, 3;
sub.s32 %r116, %r515, %r202;
mov.u32 %r516, %ctaid.y;
mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r118, %r729, 5;
neg.s32 %r517, %r118;
setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
add.s32 %r522, %r117, %r118;
mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r119, %r729, 5;
mov.u32 %r523, -4;
sub.s32 %r524, %r523, %r119;
setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
add.s32 %r529, %r117, %r119;
add.s32 %r530, %r529, 4;
mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
// end inline asm
$L__BB0_227:
mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r544, %r5, %r9;
or.b32 %r546, %r544, %r417;
setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r547, %ctaid.x;
mov.u32 %r548, %ctaid.z;
mov.u32 %r549, %nctaid.x;
mad.lo.s32 %r550, %r548, %r549, %r547;
mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r551, %r11, -1;
setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r762, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r762;
// end inline asm
setp.lt.u32 %p167, %r762, 256;
selp.u32 %r554, 1, 0, %p167;
shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_44_cu_1d87bf9c_723310nvfuser_44ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r556, %r202, 1;
shr.u32 %r557, %r556, 31;
add.s32 %r558, %r556, %r557;
shr.s32 %r559, %r558, 1;
add.s32 %r560, %r4, %r559;
add.s32 %r561, %r560, -1;
div.s32 %r562, %r561, %r4;
add.s32 %r563, %r11, -1;
add.s32 %r564, %r563, %r562;
div.s32 %r123, %r564, %r11;
add.s32 %r124, %r563, %r3;
shl.b32 %r125, %r9, 1;
shl.b32 %r565, %r4, 1;
mad.lo.s32 %r128, %r565, %r120, %r125;
or.b32 %r126, %r128, 1;
mul.lo.s32 %r127, %r565, %r11;
shr.u32 %r129, %r3, 5;
mul.lo.s32 %r566, %r45, %r129;
shr.u32 %r130, %r5, 5;
add.s32 %r567, %r566, %r130;
mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r131, %r5, 31;
add.s32 %r568, %r566, %r131;
mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r153, %r124, %r3;
setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r649, %r127, %r763;
add.s32 %r154, %r126, %r649;
add.s32 %r155, %r128, %r649;
mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r154, %r202;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r157, %r770, %r3, %r5;
setp.ge.s32 %p208, %r157, %r11;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r656, %r157, %r202, %r155;
mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
add.s32 %r770, %r770, 1;
setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r657, %f770;
mov.u32 %r658, 31;
mov.u32 %r659, 16;
mov.u32 %r660, -1;
shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
mov.b32 %r662, %f645;
mov.u32 %r663, 8;
shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
mov.b32 %r665, %f647;
mov.u32 %r666, 4;
shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
mov.b32 %r668, %f649;
mov.u32 %r669, 2;
shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
mov.b32 %r671, %f651;
mov.u32 %r672, 1;
shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r674, %f771;
mov.u32 %r675, 31;
mov.u32 %r676, 16;
mov.u32 %r677, -1;
shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
mov.b32 %r679, %f655;
mov.u32 %r680, 8;
shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
mov.b32 %r682, %f657;
mov.u32 %r683, 4;
shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
mov.b32 %r685, %f659;
mov.u32 %r686, 2;
shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
mov.b32 %r688, %f661;
mov.u32 %r689, 1;
shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r691, %f769;
mov.u32 %r692, 31;
mov.u32 %r693, 16;
mov.u32 %r694, -1;
shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
mov.b32 %r696, %f666;
mov.u32 %r697, 8;
shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
mov.b32 %r699, %f668;
mov.u32 %r700, 4;
shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
mov.b32 %r702, %f670;
mov.u32 %r703, 2;
shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
mov.b32 %r705, %f672;
mov.u32 %r706, 1;
shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r708, %f773;
mov.u32 %r709, 31;
mov.u32 %r710, 16;
mov.u32 %r711, -1;
shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
mov.b32 %r713, %f676;
mov.u32 %r714, 8;
shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
mov.b32 %r716, %f678;
mov.u32 %r717, 4;
shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
mov.b32 %r719, %f680;
mov.u32 %r720, 2;
shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
mov.b32 %r722, %f682;
mov.u32 %r723, 1;
shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r163, %r127, %r763;
add.s32 %r725, %r126, %r163;
setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
add.s32 %r726, %r128, %r163;
mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
div.s32 %r133, %r124, %r3;
mad.lo.s32 %r134, %r202, %r5, %r125;
shl.b32 %r135, %r120, 1;
shl.b32 %r136, %r11, 1;
mul.lo.s32 %r137, %r202, %r3;
mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r139, %r127, %r764, %r126;
mad.lo.s32 %r571, %r136, %r764, %r135;
mad.lo.s32 %r766, %r4, %r571, %r134;
mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r765, %r5;
mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r139, %r202;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r765, %r11;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
add.s32 %r766, %r766, %r137;
add.s32 %r765, %r765, %r3;
add.s32 %r767, %r767, 1;
setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r578, %f762;
mov.u32 %r579, 31;
mov.u32 %r580, 16;
mov.u32 %r581, -1;
shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
mov.b32 %r583, %f597;
mov.u32 %r584, 8;
shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
mov.b32 %r586, %f599;
mov.u32 %r587, 4;
shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
mov.b32 %r589, %f601;
mov.u32 %r590, 2;
shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
mov.b32 %r592, %f603;
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r595, %f763;
mov.u32 %r596, 31;
mov.u32 %r597, 16;
mov.u32 %r598, -1;
shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
mov.b32 %r600, %f607;
mov.u32 %r601, 8;
shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
mov.b32 %r603, %f609;
mov.u32 %r604, 4;
shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
mov.b32 %r606, %f611;
mov.u32 %r607, 2;
shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
mov.b32 %r609, %f613;
mov.u32 %r610, 1;
shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r612, %f761;
mov.u32 %r613, 31;
mov.u32 %r614, 16;
mov.u32 %r615, -1;
shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
mov.b32 %r617, %f618;
mov.u32 %r618, 8;
shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
mov.b32 %r620, %f620;
mov.u32 %r621, 4;
shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
mov.b32 %r623, %f622;
mov.u32 %r624, 2;
shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
mov.b32 %r626, %f624;
mov.u32 %r627, 1;
shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r629, %f765;
mov.u32 %r630, 31;
mov.u32 %r631, 16;
mov.u32 %r632, -1;
shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
mov.b32 %r634, %f628;
mov.u32 %r635, 8;
shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
mov.b32 %r637, %f630;
mov.u32 %r638, 4;
shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
mov.b32 %r640, %f632;
mov.u32 %r641, 2;
shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
mov.b32 %r643, %f634;
mov.u32 %r644, 1;
shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r151, %r127, %r764;
add.s32 %r646, %r126, %r151;
setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
add.s32 %r647, %r128, %r151;
mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r764, %r764, 1;
setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,166 +32,166 @@
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
- .reg .b32 %r<779>;
+ .reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r202, %r203}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r216, %r217}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r238, %r203, 7;
- shr.s32 %r239, %r238, 31;
- shr.u32 %r240, %r239, 29;
- add.s32 %r241, %r238, %r240;
- shr.s32 %r2, %r241, 3;
+ add.s32 %r237, %r202, 7;
+ shr.s32 %r238, %r237, 31;
+ shr.u32 %r239, %r238, 29;
+ add.s32 %r240, %r237, %r239;
+ shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
- mov.u32 %r242, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
+ mov.u32 %r241, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r243, [%rd43], %r5;
+ atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r244, %r4, %r2;
- shl.b32 %r245, %r244, 4;
- or.b32 %r246, %r245, 15;
- and.b32 %r7, %r246, -16;
- add.s32 %r247, %r246, %r7;
- and.b32 %r248, %r247, -16;
- cvt.s64.s32 %rd1, %r248;
- max.s32 %r249, %r2, %r3;
- add.s32 %r250, %r249, 31;
- shr.s32 %r251, %r250, 31;
- shr.u32 %r252, %r251, 27;
- add.s32 %r253, %r250, %r252;
- shr.u32 %r254, %r253, 5;
- mul.lo.s32 %r255, %r4, %r254;
- shl.b32 %r256, %r255, 7;
- cvt.u64.u32 %rd2, %r256;
+ mul.lo.s32 %r243, %r4, %r2;
+ shl.b32 %r244, %r243, 4;
+ or.b32 %r245, %r244, 15;
+ and.b32 %r7, %r245, -16;
+ add.s32 %r246, %r245, %r7;
+ and.b32 %r247, %r246, -16;
+ cvt.s64.s32 %rd1, %r247;
+ max.s32 %r248, %r2, %r3;
+ add.s32 %r249, %r248, 31;
+ shr.s32 %r250, %r249, 31;
+ shr.u32 %r251, %r250, 27;
+ add.s32 %r252, %r249, %r251;
+ shr.u32 %r253, %r252, 5;
+ mul.lo.s32 %r254, %r4, %r253;
+ shl.b32 %r255, %r254, 7;
+ cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r257, %r8, 7;
- setp.lt.s32 %p7, %r257, %r203;
+ or.b32 %r256, %r8, 7;
+ setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
-
-
- shl.b32 %r261, %r5, 4;
- add.s32 %r259, %r258, %r261;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
+
+
+ shl.b32 %r260, %r5, 4;
+ add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
- mov.u32 %r260, 0;
+ mov.u32 %r259, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r260, 0;
- cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r259, 0;
+ cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r735, %r6, 4;
- add.s32 %r262, %r4, 215;
- div.s32 %r263, %r262, %r4;
+ shl.b32 %r729, %r6, 4;
+ add.s32 %r261, %r4, 215;
+ div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r264, %r11, %r263;
- add.s32 %r265, %r264, -1;
- div.s32 %r12, %r265, %r11;
+ add.s32 %r263, %r11, %r262;
+ add.s32 %r264, %r263, -1;
+ div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r203;
+ cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
- mov.u32 %r267, %ctaid.y;
- mul.lo.s32 %r268, %r12, %r4;
- mul.lo.s32 %r13, %r268, %r267;
- shl.b32 %r269, %r9, 1;
- shl.b32 %r270, %r5, 4;
- mad.lo.s32 %r14, %r269, %r203, %r270;
- mul.lo.s32 %r271, %r203, %r9;
- cvt.s64.s32 %rd53, %r271;
+ mov.u32 %r266, %ctaid.y;
+ mul.lo.s32 %r267, %r12, %r4;
+ mul.lo.s32 %r13, %r267, %r266;
+ mad.lo.s32 %r268, %r2, %r9, %r5;
+ shl.b32 %r14, %r268, 4;
+ mul.lo.s32 %r269, %r202, %r9;
+ cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r272, %r13, %r203;
- cvt.s64.s32 %rd6, %r272;
+ mul.lo.s32 %r270, %r13, %r202;
+ cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- add.s32 %r15, %r271, %r8;
+ shl.b32 %r271, %r9, 3;
+ mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 2;
+ mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r275, %r274, %r16;
- shr.u32 %r17, %r5, 5;
- add.s32 %r276, %r275, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r275, %r274, %r15;
+ shr.u32 %r16, %r5, 5;
+ add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
- and.b32 %r18, %r5, 31;
- add.s32 %r277, %r275, %r18;
+ and.b32 %r17, %r5, 31;
+ add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
- mov.u32 %r734, 0;
+ mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
- add.s32 %r281, %r14, %r280;
+ add.s32 %r281, %r280, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
- add.s32 %r284, %r14, %r283;
+ add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
@@ -207,29 +207,29 @@
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r734, %r4;
- add.s32 %r278, %r23, %r9;
- add.s32 %r24, %r278, %r13;
- setp.gt.s32 %p13, %r24, 215;
+ mul.lo.s32 %r22, %r728, %r4;
+ add.s32 %r278, %r22, %r9;
+ add.s32 %r23, %r278, %r13;
+ setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
- mul.lo.s32 %r279, %r24, %r212;
+ mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p14, %r24, 216;
+ setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
- mul.lo.s32 %r286, %r23, %r203;
+ mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
@@ -255,11 +255,11 @@
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
- mul.lo.s32 %r287, %r24, %r216;
+ mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
@@ -478,23 +478,23 @@
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
- shl.b32 %r735, %r735, 2;
- bar.sync 0;
- setp.ne.s32 %p23, %r18, 0;
+ shl.b32 %r729, %r729, 2;
+ bar.sync 0;
+ setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
- setp.ne.s32 %p24, %r17, 0;
+ setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
- setp.ge.u32 %p25, %r18, %r16;
+ setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
@@ -526,11 +526,11 @@
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
- setp.ne.s32 %p242, %r18, 0;
+ setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
@@ -556,23 +556,23 @@
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
- setp.eq.s32 %p37, %r18, 0;
+ setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
- setp.ne.s32 %p240, %r17, 0;
+ setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
- setp.ge.u32 %p39, %r18, %r16;
+ setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
@@ -615,11 +615,11 @@
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
- setp.eq.s32 %p241, %r18, 0;
+ setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
@@ -633,11 +633,10 @@
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
- mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
@@ -846,13 +845,12 @@
{ cvt.rn.bf16.f32 %rs124, %f401;}
mov.b32 %r391, {%rs124, %rs128};
- add.s32 %r416, %r13, %r732;
- mad.lo.s32 %r417, %r416, %r203, %r15;
- mul.wide.s32 %rd76, %r417, 2;
+ mad.lo.s32 %r416, %r23, %r202, %r8;
+ mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
bra.uni $L__BB0_35;
@@ -862,12 +860,12 @@
{ cvt.rn.bf16.f32 %rs61, %f337;}
$L__BB0_35:
- add.s32 %r734, %r734, 1;
- setp.lt.s32 %p49, %r734, %r12;
+ add.s32 %r728, %r728, 1;
+ setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
@@ -886,68 +884,68 @@
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
- mov.u32 %r418, %tid.z;
- mad.lo.s32 %r46, %r418, %r4, %r9;
- mad.lo.s32 %r47, %r46, %r3, %r5;
- mul.wide.u32 %rd77, %r47, 4;
+ mov.u32 %r417, %tid.z;
+ mad.lo.s32 %r45, %r417, %r4, %r9;
+ mad.lo.s32 %r46, %r45, %r3, %r5;
+ mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
- clz.b32 %r419, %r4;
- mov.u32 %r420, 31;
- sub.s32 %r48, %r420, %r419;
- mov.u32 %r421, 1;
- shl.b32 %r766, %r421, %r48;
- setp.lt.u32 %p50, %r9, %r766;
- add.s32 %r422, %r766, %r9;
- setp.lt.u32 %p51, %r422, %r4;
+ clz.b32 %r418, %r4;
+ mov.u32 %r419, 31;
+ sub.s32 %r47, %r419, %r418;
+ mov.u32 %r420, 1;
+ shl.b32 %r760, %r420, %r47;
+ setp.lt.u32 %p50, %r9, %r760;
+ add.s32 %r421, %r760, %r9;
+ setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
- shl.b32 %r423, %r3, %r48;
- add.s32 %r424, %r47, %r423;
- mul.wide.s32 %rd79, %r424, 4;
+ shl.b32 %r422, %r3, %r47;
+ add.s32 %r423, %r46, %r422;
+ mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
- setp.lt.s32 %p53, %r766, 4;
+ setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
- mov.u32 %r736, %r766;
+ mov.u32 %r730, %r760;
$L__BB0_40:
- shr.u32 %r51, %r736, 1;
- setp.ge.u32 %p54, %r9, %r51;
+ shr.u32 %r50, %r730, 1;
+ setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
- mad.lo.s32 %r425, %r51, %r3, %r47;
- mul.wide.s32 %rd82, %r425, 4;
+ mad.lo.s32 %r424, %r50, %r3, %r46;
+ mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
- setp.gt.u32 %p55, %r736, 7;
- mov.u32 %r736, %r51;
+ setp.gt.u32 %p55, %r730, 7;
+ mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
- mov.u32 %r737, 0;
- add.s32 %r427, %r47, %r3;
- mul.wide.u32 %rd85, %r427, 4;
+ mov.u32 %r731, 0;
+ add.s32 %r426, %r46, %r3;
+ mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
@@ -956,54 +954,54 @@
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
- mov.b32 %r737, %f743;
+ mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
- shl.b32 %r428, %r3, %r48;
- add.s32 %r429, %r47, %r428;
- mul.wide.s32 %rd87, %r429, 4;
+ shl.b32 %r427, %r3, %r47;
+ add.s32 %r428, %r46, %r427;
+ mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
- mov.u32 %r738, %r766;
+ mov.u32 %r732, %r760;
$L__BB0_51:
- shr.u32 %r55, %r738, 1;
- setp.ge.u32 %p60, %r9, %r55;
+ shr.u32 %r54, %r732, 1;
+ setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
- mad.lo.s32 %r430, %r55, %r3, %r47;
- mul.wide.s32 %rd90, %r430, 4;
+ mad.lo.s32 %r429, %r54, %r3, %r46;
+ mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
- setp.gt.u32 %p61, %r738, 7;
- mov.u32 %r738, %r55;
+ setp.gt.u32 %p61, %r732, 7;
+ mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
- mov.u32 %r739, 0;
+ mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1011,54 +1009,54 @@
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
- mov.b32 %r739, %f744;
+ mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
- shl.b32 %r432, %r3, %r48;
- add.s32 %r433, %r47, %r432;
- mul.wide.s32 %rd93, %r433, 4;
+ shl.b32 %r431, %r3, %r47;
+ add.s32 %r432, %r46, %r431;
+ mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
- mov.u32 %r740, %r766;
+ mov.u32 %r734, %r760;
$L__BB0_62:
- shr.u32 %r59, %r740, 1;
- setp.ge.u32 %p66, %r9, %r59;
+ shr.u32 %r58, %r734, 1;
+ setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
- mad.lo.s32 %r434, %r59, %r3, %r47;
- mul.wide.s32 %rd96, %r434, 4;
+ mad.lo.s32 %r433, %r58, %r3, %r46;
+ mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
- setp.gt.u32 %p67, %r740, 7;
- mov.u32 %r740, %r59;
+ setp.gt.u32 %p67, %r734, 7;
+ mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
- mov.u32 %r741, 0;
+ mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1066,54 +1064,54 @@
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
- mov.b32 %r741, %f745;
+ mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
- shl.b32 %r436, %r3, %r48;
- add.s32 %r437, %r47, %r436;
- mul.wide.s32 %rd99, %r437, 4;
+ shl.b32 %r435, %r3, %r47;
+ add.s32 %r436, %r46, %r435;
+ mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
- mov.u32 %r742, %r766;
+ mov.u32 %r736, %r760;
$L__BB0_73:
- shr.u32 %r63, %r742, 1;
- setp.ge.u32 %p72, %r9, %r63;
+ shr.u32 %r62, %r736, 1;
+ setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
- mad.lo.s32 %r438, %r63, %r3, %r47;
- mul.wide.s32 %rd102, %r438, 4;
+ mad.lo.s32 %r437, %r62, %r3, %r46;
+ mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
- setp.gt.u32 %p73, %r742, 7;
- mov.u32 %r742, %r63;
+ setp.gt.u32 %p73, %r736, 7;
+ mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
- mov.u32 %r743, 0;
+ mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1121,54 +1119,54 @@
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
- mov.b32 %r743, %f746;
+ mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
- shl.b32 %r440, %r3, %r48;
- add.s32 %r441, %r47, %r440;
- mul.wide.s32 %rd105, %r441, 4;
+ shl.b32 %r439, %r3, %r47;
+ add.s32 %r440, %r46, %r439;
+ mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
- mov.u32 %r744, %r766;
+ mov.u32 %r738, %r760;
$L__BB0_84:
- shr.u32 %r67, %r744, 1;
- setp.ge.u32 %p78, %r9, %r67;
+ shr.u32 %r66, %r738, 1;
+ setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
- mad.lo.s32 %r442, %r67, %r3, %r47;
- mul.wide.s32 %rd108, %r442, 4;
+ mad.lo.s32 %r441, %r66, %r3, %r46;
+ mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
- setp.gt.u32 %p79, %r744, 7;
- mov.u32 %r744, %r67;
+ setp.gt.u32 %p79, %r738, 7;
+ mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
- mov.u32 %r745, 0;
+ mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1176,54 +1174,54 @@
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
- mov.b32 %r745, %f747;
+ mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
- shl.b32 %r444, %r3, %r48;
- add.s32 %r445, %r47, %r444;
- mul.wide.s32 %rd111, %r445, 4;
+ shl.b32 %r443, %r3, %r47;
+ add.s32 %r444, %r46, %r443;
+ mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
- mov.u32 %r746, %r766;
+ mov.u32 %r740, %r760;
$L__BB0_95:
- shr.u32 %r71, %r746, 1;
- setp.ge.u32 %p84, %r9, %r71;
+ shr.u32 %r70, %r740, 1;
+ setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
- mad.lo.s32 %r446, %r71, %r3, %r47;
- mul.wide.s32 %rd114, %r446, 4;
+ mad.lo.s32 %r445, %r70, %r3, %r46;
+ mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
- setp.gt.u32 %p85, %r746, 7;
- mov.u32 %r746, %r71;
+ setp.gt.u32 %p85, %r740, 7;
+ mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
- mov.u32 %r747, 0;
+ mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1231,54 +1229,54 @@
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
- mov.b32 %r747, %f748;
+ mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
- shl.b32 %r448, %r3, %r48;
- add.s32 %r449, %r47, %r448;
- mul.wide.s32 %rd117, %r449, 4;
+ shl.b32 %r447, %r3, %r47;
+ add.s32 %r448, %r46, %r447;
+ mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
- mov.u32 %r748, %r766;
+ mov.u32 %r742, %r760;
$L__BB0_106:
- shr.u32 %r75, %r748, 1;
- setp.ge.u32 %p90, %r9, %r75;
+ shr.u32 %r74, %r742, 1;
+ setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
- mad.lo.s32 %r450, %r75, %r3, %r47;
- mul.wide.s32 %rd120, %r450, 4;
+ mad.lo.s32 %r449, %r74, %r3, %r46;
+ mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
- setp.gt.u32 %p91, %r748, 7;
- mov.u32 %r748, %r75;
+ setp.gt.u32 %p91, %r742, 7;
+ mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
- mov.u32 %r749, 0;
+ mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1286,54 +1284,54 @@
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
- mov.b32 %r749, %f749;
+ mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
- shl.b32 %r452, %r3, %r48;
- add.s32 %r453, %r47, %r452;
- mul.wide.s32 %rd123, %r453, 4;
+ shl.b32 %r451, %r3, %r47;
+ add.s32 %r452, %r46, %r451;
+ mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
- mov.u32 %r750, %r766;
+ mov.u32 %r744, %r760;
$L__BB0_117:
- shr.u32 %r79, %r750, 1;
- setp.ge.u32 %p96, %r9, %r79;
+ shr.u32 %r78, %r744, 1;
+ setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
- mad.lo.s32 %r454, %r79, %r3, %r47;
- mul.wide.s32 %rd126, %r454, 4;
+ mad.lo.s32 %r453, %r78, %r3, %r46;
+ mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
- setp.gt.u32 %p97, %r750, 7;
- mov.u32 %r750, %r79;
+ setp.gt.u32 %p97, %r744, 7;
+ mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
- mov.u32 %r751, 0;
+ mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1341,55 +1339,55 @@
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
- mov.b32 %r751, %f750;
+ mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
- shl.b32 %r82, %r735, 4;
+ shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
- shl.b32 %r456, %r3, %r48;
- add.s32 %r457, %r47, %r456;
- mul.wide.s32 %rd129, %r457, 4;
+ shl.b32 %r455, %r3, %r47;
+ add.s32 %r456, %r46, %r455;
+ mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
- mov.u32 %r752, %r766;
+ mov.u32 %r746, %r760;
$L__BB0_128:
- shr.u32 %r84, %r752, 1;
- setp.ge.u32 %p102, %r9, %r84;
+ shr.u32 %r83, %r746, 1;
+ setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
- mad.lo.s32 %r458, %r84, %r3, %r47;
- mul.wide.s32 %rd132, %r458, 4;
+ mad.lo.s32 %r457, %r83, %r3, %r46;
+ mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
- setp.gt.u32 %p103, %r752, 7;
- mov.u32 %r752, %r84;
+ setp.gt.u32 %p103, %r746, 7;
+ mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
- mov.u32 %r753, 0;
+ mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1397,54 +1395,54 @@
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
- mov.b32 %r753, %f751;
+ mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
- shl.b32 %r460, %r3, %r48;
- add.s32 %r461, %r47, %r460;
- mul.wide.s32 %rd135, %r461, 4;
+ shl.b32 %r459, %r3, %r47;
+ add.s32 %r460, %r46, %r459;
+ mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
- mov.u32 %r754, %r766;
+ mov.u32 %r748, %r760;
$L__BB0_139:
- shr.u32 %r88, %r754, 1;
- setp.ge.u32 %p108, %r9, %r88;
+ shr.u32 %r87, %r748, 1;
+ setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
- mad.lo.s32 %r462, %r88, %r3, %r47;
- mul.wide.s32 %rd138, %r462, 4;
+ mad.lo.s32 %r461, %r87, %r3, %r46;
+ mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
- setp.gt.u32 %p109, %r754, 7;
- mov.u32 %r754, %r88;
+ setp.gt.u32 %p109, %r748, 7;
+ mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
- mov.u32 %r755, 0;
+ mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1452,54 +1450,54 @@
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
- mov.b32 %r755, %f752;
+ mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
- shl.b32 %r464, %r3, %r48;
- add.s32 %r465, %r47, %r464;
- mul.wide.s32 %rd141, %r465, 4;
+ shl.b32 %r463, %r3, %r47;
+ add.s32 %r464, %r46, %r463;
+ mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
- mov.u32 %r756, %r766;
+ mov.u32 %r750, %r760;
$L__BB0_150:
- shr.u32 %r92, %r756, 1;
- setp.ge.u32 %p114, %r9, %r92;
+ shr.u32 %r91, %r750, 1;
+ setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
- mad.lo.s32 %r466, %r92, %r3, %r47;
- mul.wide.s32 %rd144, %r466, 4;
+ mad.lo.s32 %r465, %r91, %r3, %r46;
+ mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
- setp.gt.u32 %p115, %r756, 7;
- mov.u32 %r756, %r92;
+ setp.gt.u32 %p115, %r750, 7;
+ mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
- mov.u32 %r757, 0;
+ mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1507,54 +1505,54 @@
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
- mov.b32 %r757, %f753;
+ mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
- shl.b32 %r468, %r3, %r48;
- add.s32 %r469, %r47, %r468;
- mul.wide.s32 %rd147, %r469, 4;
+ shl.b32 %r467, %r3, %r47;
+ add.s32 %r468, %r46, %r467;
+ mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
- mov.u32 %r758, %r766;
+ mov.u32 %r752, %r760;
$L__BB0_161:
- shr.u32 %r96, %r758, 1;
- setp.ge.u32 %p120, %r9, %r96;
+ shr.u32 %r95, %r752, 1;
+ setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
- mad.lo.s32 %r470, %r96, %r3, %r47;
- mul.wide.s32 %rd150, %r470, 4;
+ mad.lo.s32 %r469, %r95, %r3, %r46;
+ mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
- setp.gt.u32 %p121, %r758, 7;
- mov.u32 %r758, %r96;
+ setp.gt.u32 %p121, %r752, 7;
+ mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
- mov.u32 %r759, 0;
+ mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1562,54 +1560,54 @@
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
- mov.b32 %r759, %f754;
+ mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
- shl.b32 %r472, %r3, %r48;
- add.s32 %r473, %r47, %r472;
- mul.wide.s32 %rd153, %r473, 4;
+ shl.b32 %r471, %r3, %r47;
+ add.s32 %r472, %r46, %r471;
+ mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
- mov.u32 %r760, %r766;
+ mov.u32 %r754, %r760;
$L__BB0_172:
- shr.u32 %r100, %r760, 1;
- setp.ge.u32 %p126, %r9, %r100;
+ shr.u32 %r99, %r754, 1;
+ setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
- mad.lo.s32 %r474, %r100, %r3, %r47;
- mul.wide.s32 %rd156, %r474, 4;
+ mad.lo.s32 %r473, %r99, %r3, %r46;
+ mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
- setp.gt.u32 %p127, %r760, 7;
- mov.u32 %r760, %r100;
+ setp.gt.u32 %p127, %r754, 7;
+ mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
- mov.u32 %r761, 0;
+ mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1617,54 +1615,54 @@
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
- mov.b32 %r761, %f755;
+ mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
- shl.b32 %r476, %r3, %r48;
- add.s32 %r477, %r47, %r476;
- mul.wide.s32 %rd159, %r477, 4;
+ shl.b32 %r475, %r3, %r47;
+ add.s32 %r476, %r46, %r475;
+ mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
- mov.u32 %r762, %r766;
+ mov.u32 %r756, %r760;
$L__BB0_183:
- shr.u32 %r104, %r762, 1;
- setp.ge.u32 %p132, %r9, %r104;
+ shr.u32 %r103, %r756, 1;
+ setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
- mad.lo.s32 %r478, %r104, %r3, %r47;
- mul.wide.s32 %rd162, %r478, 4;
+ mad.lo.s32 %r477, %r103, %r3, %r46;
+ mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
- setp.gt.u32 %p133, %r762, 7;
- mov.u32 %r762, %r104;
+ setp.gt.u32 %p133, %r756, 7;
+ mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
- mov.u32 %r763, 0;
+ mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1672,54 +1670,54 @@
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
- mov.b32 %r763, %f756;
+ mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
- shl.b32 %r480, %r3, %r48;
- add.s32 %r481, %r47, %r480;
- mul.wide.s32 %rd165, %r481, 4;
+ shl.b32 %r479, %r3, %r47;
+ add.s32 %r480, %r46, %r479;
+ mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
- mov.u32 %r764, %r766;
+ mov.u32 %r758, %r760;
$L__BB0_194:
- shr.u32 %r108, %r764, 1;
- setp.ge.u32 %p138, %r9, %r108;
+ shr.u32 %r107, %r758, 1;
+ setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
- mad.lo.s32 %r482, %r108, %r3, %r47;
- mul.wide.s32 %rd168, %r482, 4;
+ mad.lo.s32 %r481, %r107, %r3, %r46;
+ mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
- setp.gt.u32 %p139, %r764, 7;
- mov.u32 %r764, %r108;
+ setp.gt.u32 %p139, %r758, 7;
+ mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
- mov.u32 %r765, 0;
+ mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@@ -1727,21 +1725,21 @@
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
- mov.b32 %r765, %f757;
+ mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
- shl.b32 %r484, %r3, %r48;
- add.s32 %r485, %r47, %r484;
- mul.wide.s32 %rd171, %r485, 4;
+ shl.b32 %r483, %r3, %r47;
+ add.s32 %r484, %r46, %r483;
+ mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
@@ -1749,30 +1747,30 @@
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
- shr.u32 %r112, %r766, 1;
- setp.ge.u32 %p144, %r9, %r112;
+ shr.u32 %r111, %r760, 1;
+ setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
- mad.lo.s32 %r486, %r112, %r3, %r47;
- mul.wide.s32 %rd174, %r486, 4;
+ mad.lo.s32 %r485, %r111, %r3, %r46;
+ mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
- setp.gt.u32 %p145, %r766, 7;
- mov.u32 %r766, %r112;
+ setp.gt.u32 %p145, %r760, 7;
+ mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
- mov.u32 %r767, 0;
+ mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@@ -1780,420 +1778,416 @@
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
- mov.b32 %r767, %f758;
+ mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
- shl.b32 %r731, %r5, 3;
- mov.u32 %r512, %ctaid.y;
- mad.lo.s32 %r513, %r203, %r512, %r731;
- add.s32 %r514, %r513, %r82;
- mul.wide.s32 %rd183, %r514, 4;
+ mov.u32 %r511, %ctaid.y;
+ mad.lo.s32 %r512, %r202, %r511, %r8;
+ add.s32 %r513, %r512, %r81;
+ mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
- st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
-
- add.s32 %r515, %r514, 4;
- mul.wide.s32 %rd184, %r515, 4;
+ st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
+
+ add.s32 %r514, %r513, 4;
+ mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
- st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
bra.uni $L__BB0_218;
$L__BB0_212:
- shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
- add.s32 %r488, %r728, 3;
- sub.s32 %r115, %r488, %r203;
- mov.u32 %r489, %ctaid.y;
- mad.lo.s32 %r116, %r203, %r489, %r728;
- neg.s32 %r490, %r82;
- setp.ge.s32 %p151, %r115, %r490;
+ add.s32 %r487, %r8, 3;
+ sub.s32 %r114, %r487, %r202;
+ mov.u32 %r488, %ctaid.y;
+ mad.lo.s32 %r115, %r202, %r488, %r8;
+ neg.s32 %r489, %r81;
+ setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
- add.s32 %r495, %r116, %r82;
- mul.wide.s32 %rd178, %r495, 4;
+ add.s32 %r494, %r115, %r81;
+ mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
- st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
+ st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
$L__BB0_214:
- mov.u32 %r496, -4;
- sub.s32 %r497, %r496, %r82;
- setp.ge.s32 %p153, %r115, %r497;
+ mov.u32 %r495, -4;
+ sub.s32 %r496, %r495, %r81;
+ setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
- add.s32 %r502, %r116, %r82;
- add.s32 %r503, %r502, 4;
- mul.wide.s32 %rd180, %r503, 4;
+ add.s32 %r501, %r115, %r81;
+ add.s32 %r502, %r501, 4;
+ mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
- st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
- shl.b32 %r730, %r5, 3;
- shl.b32 %r540, %r735, 5;
- mov.u32 %r541, %ctaid.y;
- mad.lo.s32 %r542, %r203, %r541, %r730;
- add.s32 %r543, %r542, %r540;
- mul.wide.s32 %rd191, %r543, 4;
+ shl.b32 %r539, %r729, 5;
+ mov.u32 %r540, %ctaid.y;
+ mad.lo.s32 %r541, %r202, %r540, %r8;
+ add.s32 %r542, %r541, %r539;
+ mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
-
- add.s32 %r544, %r543, 4;
- mul.wide.s32 %rd192, %r544, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
+
+ add.s32 %r543, %r542, 4;
+ mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
bra.uni $L__BB0_227;
$L__BB0_219:
- shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
- add.s32 %r516, %r729, 3;
- sub.s32 %r117, %r516, %r203;
- mov.u32 %r517, %ctaid.y;
- mad.lo.s32 %r118, %r203, %r517, %r729;
+ add.s32 %r515, %r8, 3;
+ sub.s32 %r116, %r515, %r202;
+ mov.u32 %r516, %ctaid.y;
+ mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
- shl.b32 %r119, %r735, 5;
- neg.s32 %r518, %r119;
- setp.ge.s32 %p160, %r117, %r518;
+ shl.b32 %r118, %r729, 5;
+ neg.s32 %r517, %r118;
+ setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
- add.s32 %r523, %r118, %r119;
- mul.wide.s32 %rd186, %r523, 4;
+ add.s32 %r522, %r117, %r118;
+ mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
+ st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
$L__BB0_222:
@%p159 bra $L__BB0_227;
- shl.b32 %r120, %r735, 5;
- mov.u32 %r524, -4;
- sub.s32 %r525, %r524, %r120;
- setp.ge.s32 %p162, %r117, %r525;
+ shl.b32 %r119, %r729, 5;
+ mov.u32 %r523, -4;
+ sub.s32 %r524, %r523, %r119;
+ setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
- add.s32 %r530, %r118, %r120;
- add.s32 %r531, %r530, 4;
- mul.wide.s32 %rd188, %r531, 4;
+ add.s32 %r529, %r117, %r119;
+ add.s32 %r530, %r529, 4;
+ mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
$L__BB0_227:
- mov.u32 %r121, %ctaid.y;
+ mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r545, %r5, %r9;
- or.b32 %r547, %r545, %r418;
- setp.ne.s32 %p164, %r547, 0;
+ or.b32 %r544, %r5, %r9;
+ or.b32 %r546, %r544, %r417;
+ setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
- mov.u32 %r548, %ctaid.x;
- mov.u32 %r549, %ctaid.z;
- mov.u32 %r550, %nctaid.x;
- mad.lo.s32 %r551, %r549, %r550, %r548;
- mul.wide.s32 %rd194, %r551, 8;
+ mov.u32 %r547, %ctaid.x;
+ mov.u32 %r548, %ctaid.z;
+ mov.u32 %r549, %nctaid.x;
+ mad.lo.s32 %r550, %r548, %r549, %r547;
+ mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
- add.s32 %r552, %r11, -1;
- setp.eq.s32 %p165, %r121, %r552;
+ add.s32 %r551, %r11, -1;
+ setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
- mov.u32 %r768, 8;
+ mov.u32 %r762, 8;
$L__BB0_230:
- nanosleep.u32 %r768;
-
- setp.lt.u32 %p167, %r768, 256;
- selp.u32 %r555, 1, 0, %p167;
- shl.b32 %r768, %r768, %r555;
+ nanosleep.u32 %r762;
+
+ setp.lt.u32 %p167, %r762, 256;
+ selp.u32 %r554, 1, 0, %p167;
+ shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- add.s32 %r557, %r203, 1;
- shr.u32 %r558, %r557, 31;
- add.s32 %r559, %r557, %r558;
- shr.s32 %r560, %r559, 1;
- add.s32 %r561, %r4, %r560;
- add.s32 %r562, %r561, -1;
- div.s32 %r563, %r562, %r4;
- add.s32 %r564, %r11, -1;
- add.s32 %r565, %r564, %r563;
- div.s32 %r124, %r565, %r11;
- add.s32 %r125, %r564, %r3;
- shl.b32 %r126, %r9, 1;
- shl.b32 %r566, %r4, 1;
- mad.lo.s32 %r129, %r566, %r121, %r126;
- or.b32 %r127, %r129, 1;
- mul.lo.s32 %r128, %r566, %r11;
- shr.u32 %r130, %r3, 5;
- mul.lo.s32 %r567, %r46, %r130;
- shr.u32 %r131, %r5, 5;
- add.s32 %r568, %r567, %r131;
- mul.wide.u32 %rd203, %r568, 4;
+ add.s32 %r556, %r202, 1;
+ shr.u32 %r557, %r556, 31;
+ add.s32 %r558, %r556, %r557;
+ shr.s32 %r559, %r558, 1;
+ add.s32 %r560, %r4, %r559;
+ add.s32 %r561, %r560, -1;
+ div.s32 %r562, %r561, %r4;
+ add.s32 %r563, %r11, -1;
+ add.s32 %r564, %r563, %r562;
+ div.s32 %r123, %r564, %r11;
+ add.s32 %r124, %r563, %r3;
+ shl.b32 %r125, %r9, 1;
+ shl.b32 %r565, %r4, 1;
+ mad.lo.s32 %r128, %r565, %r120, %r125;
+ or.b32 %r126, %r128, 1;
+ mul.lo.s32 %r127, %r565, %r11;
+ shr.u32 %r129, %r3, 5;
+ mul.lo.s32 %r566, %r45, %r129;
+ shr.u32 %r130, %r5, 5;
+ add.s32 %r567, %r566, %r130;
+ mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
- and.b32 %r132, %r5, 31;
- add.s32 %r569, %r567, %r132;
- mul.wide.u32 %rd205, %r569, 4;
+ and.b32 %r131, %r5, 31;
+ add.s32 %r568, %r566, %r131;
+ mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
- mov.u32 %r769, 0;
+ mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
- add.s32 %r769, %r769, 1;
+ add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
- setp.lt.s32 %p169, %r769, %r124;
+ setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
- div.s32 %r154, %r125, %r3;
- setp.lt.s32 %p206, %r154, 1;
+ div.s32 %r153, %r124, %r3;
+ setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
- mul.lo.s32 %r650, %r128, %r769;
- add.s32 %r155, %r127, %r650;
- add.s32 %r156, %r129, %r650;
- mov.u32 %r649, 0;
+ mul.lo.s32 %r649, %r127, %r763;
+ add.s32 %r154, %r126, %r649;
+ add.s32 %r155, %r128, %r649;
+ mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
- mov.u32 %r776, %r649;
+ mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
- setp.ge.s32 %p207, %r155, %r203;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ setp.ge.s32 %p207, %r154, %r202;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
- mad.lo.s32 %r158, %r776, %r3, %r5;
- setp.ge.s32 %p208, %r158, %r11;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ mad.lo.s32 %r157, %r770, %r3, %r5;
+ setp.ge.s32 %p208, %r157, %r11;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
- mad.lo.s32 %r657, %r158, %r203, %r156;
- mul.wide.s32 %rd211, %r657, 4;
+ mad.lo.s32 %r656, %r157, %r202, %r155;
+ mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
- ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
+ ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
$L__BB0_263:
- mov.b32 %f642, %r778;
+ mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
- mov.b32 %f643, %r777;
+ mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
- add.s32 %r776, %r776, 1;
- setp.lt.s32 %p209, %r776, %r154;
+ add.s32 %r770, %r770, 1;
+ setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
- mov.b32 %r658, %f770;
- mov.u32 %r659, 31;
- mov.u32 %r660, 16;
- mov.u32 %r661, -1;
- shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
- mov.b32 %f644, %r662;
+ mov.b32 %r657, %f770;
+ mov.u32 %r658, 31;
+ mov.u32 %r659, 16;
+ mov.u32 %r660, -1;
+ shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
+ mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
- mov.b32 %r663, %f645;
- mov.u32 %r664, 8;
- shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
- mov.b32 %f646, %r665;
+ mov.b32 %r662, %f645;
+ mov.u32 %r663, 8;
+ shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
+ mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
- mov.b32 %r666, %f647;
- mov.u32 %r667, 4;
- shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
- mov.b32 %f648, %r668;
+ mov.b32 %r665, %f647;
+ mov.u32 %r666, 4;
+ shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
+ mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
- mov.b32 %r669, %f649;
- mov.u32 %r670, 2;
- shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
- mov.b32 %f650, %r671;
+ mov.b32 %r668, %f649;
+ mov.u32 %r669, 2;
+ shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
+ mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
- mov.b32 %r672, %f651;
- mov.u32 %r673, 1;
- shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
- mov.b32 %f652, %r674;
+ mov.b32 %r671, %f651;
+ mov.u32 %r672, 1;
+ shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
+ mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
- setp.ne.s32 %p215, %r132, 0;
+ setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
- setp.ne.s32 %p216, %r131, 0;
+ setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
- setp.ge.u32 %p217, %r132, %r130;
+ setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
- mov.b32 %r675, %f771;
- mov.u32 %r676, 31;
- mov.u32 %r677, 16;
- mov.u32 %r678, -1;
- shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
- mov.b32 %f654, %r679;
+ mov.b32 %r674, %f771;
+ mov.u32 %r675, 31;
+ mov.u32 %r676, 16;
+ mov.u32 %r677, -1;
+ shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
+ mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
- mov.b32 %r680, %f655;
- mov.u32 %r681, 8;
- shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
- mov.b32 %f656, %r682;
+ mov.b32 %r679, %f655;
+ mov.u32 %r680, 8;
+ shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
+ mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
- mov.b32 %r683, %f657;
- mov.u32 %r684, 4;
- shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
- mov.b32 %f658, %r685;
+ mov.b32 %r682, %f657;
+ mov.u32 %r683, 4;
+ shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
+ mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
- mov.b32 %r686, %f659;
- mov.u32 %r687, 2;
- shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
- mov.b32 %f660, %r688;
+ mov.b32 %r685, %f659;
+ mov.u32 %r686, 2;
+ shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
+ mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
- mov.b32 %r689, %f661;
- mov.u32 %r690, 1;
- shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
- mov.b32 %f662, %r691;
+ mov.b32 %r688, %f661;
+ mov.u32 %r689, 1;
+ shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
+ mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
- setp.eq.s32 %p224, %r132, 0;
+ setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
{ cvt.rn.bf16.f32 %rs131, %f663;}
- mov.b32 %r692, %f769;
- mov.u32 %r693, 31;
- mov.u32 %r694, 16;
- mov.u32 %r695, -1;
- shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
- mov.b32 %f665, %r696;
+ mov.b32 %r691, %f769;
+ mov.u32 %r692, 31;
+ mov.u32 %r693, 16;
+ mov.u32 %r694, -1;
+ shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
+ mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
- mov.b32 %r697, %f666;
- mov.u32 %r698, 8;
- shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
- mov.b32 %f667, %r699;
+ mov.b32 %r696, %f666;
+ mov.u32 %r697, 8;
+ shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
+ mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
- mov.b32 %r700, %f668;
- mov.u32 %r701, 4;
- shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
- mov.b32 %f669, %r702;
+ mov.b32 %r699, %f668;
+ mov.u32 %r700, 4;
+ shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
+ mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
- mov.b32 %r703, %f670;
- mov.u32 %r704, 2;
- shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
- mov.b32 %f671, %r705;
+ mov.b32 %r702, %f670;
+ mov.u32 %r703, 2;
+ shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
+ mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
- mov.b32 %r706, %f672;
- mov.u32 %r707, 1;
- shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
- mov.b32 %f673, %r708;
+ mov.b32 %r705, %f672;
+ mov.u32 %r706, 1;
+ shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
+ mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
- setp.ge.u32 %p231, %r132, %r130;
+ setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
- mov.b32 %r709, %f773;
- mov.u32 %r710, 31;
- mov.u32 %r711, 16;
- mov.u32 %r712, -1;
- shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
- mov.b32 %f675, %r713;
+ mov.b32 %r708, %f773;
+ mov.u32 %r709, 31;
+ mov.u32 %r710, 16;
+ mov.u32 %r711, -1;
+ shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
+ mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
- mov.b32 %r714, %f676;
- mov.u32 %r715, 8;
- shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
- mov.b32 %f677, %r716;
+ mov.b32 %r713, %f676;
+ mov.u32 %r714, 8;
+ shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
+ mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
- mov.b32 %r717, %f678;
- mov.u32 %r718, 4;
- shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
- mov.b32 %f679, %r719;
+ mov.b32 %r716, %f678;
+ mov.u32 %r717, 4;
+ shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
+ mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
- mov.b32 %r720, %f680;
- mov.u32 %r721, 2;
- shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
- mov.b32 %f681, %r722;
+ mov.b32 %r719, %f680;
+ mov.u32 %r720, 2;
+ shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
+ mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
- mov.b32 %r723, %f682;
- mov.u32 %r724, 1;
- shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
- mov.b32 %f683, %r725;
+ mov.b32 %r722, %f682;
+ mov.u32 %r723, 1;
+ shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
+ mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
@@ -2202,228 +2196,228 @@
{ cvt.rn.bf16.f32 %rs132, %f684;}
@%p6 bra $L__BB0_279;
- mul.lo.s32 %r164, %r128, %r769;
- add.s32 %r726, %r127, %r164;
- setp.ge.s32 %p239, %r726, %r203;
+ mul.lo.s32 %r163, %r127, %r763;
+ add.s32 %r725, %r126, %r163;
+ setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
- add.s32 %r727, %r129, %r164;
- mul.wide.s32 %rd212, %r727, 2;
+ add.s32 %r726, %r128, %r163;
+ mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
- setp.lt.s32 %p170, %r124, 1;
+ setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
- div.s32 %r134, %r125, %r3;
- mad.lo.s32 %r135, %r203, %r5, %r126;
- shl.b32 %r136, %r121, 1;
- shl.b32 %r137, %r11, 1;
- mul.lo.s32 %r138, %r203, %r3;
- mov.u32 %r770, 0;
+ div.s32 %r133, %r124, %r3;
+ mad.lo.s32 %r134, %r202, %r5, %r125;
+ shl.b32 %r135, %r120, 1;
+ shl.b32 %r136, %r11, 1;
+ mul.lo.s32 %r137, %r202, %r3;
+ mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
- setp.lt.s32 %p171, %r134, 1;
+ setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
- mad.lo.s32 %r140, %r128, %r770, %r127;
- mad.lo.s32 %r572, %r137, %r770, %r136;
- mad.lo.s32 %r772, %r4, %r572, %r135;
- mov.u32 %r571, 0;
+ mad.lo.s32 %r139, %r127, %r764, %r126;
+ mad.lo.s32 %r571, %r136, %r764, %r135;
+ mad.lo.s32 %r766, %r4, %r571, %r134;
+ mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
- mov.u32 %r771, %r5;
- mov.u32 %r773, %r571;
+ mov.u32 %r765, %r5;
+ mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
- setp.ge.s32 %p172, %r140, %r203;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p172, %r139, %r202;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
- setp.ge.s32 %p173, %r771, %r11;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p173, %r765, %r11;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
- mul.wide.s32 %rd207, %r772, 4;
+ mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
- ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
+ ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
$L__BB0_240:
- mov.b32 %f594, %r775;
+ mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
- mov.b32 %f595, %r774;
+ mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
- add.s32 %r772, %r772, %r138;
- add.s32 %r771, %r771, %r3;
- add.s32 %r773, %r773, 1;
- setp.lt.s32 %p174, %r773, %r134;
+ add.s32 %r766, %r766, %r137;
+ add.s32 %r765, %r765, %r3;
+ add.s32 %r767, %r767, 1;
+ setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
- mov.b32 %r579, %f762;
- mov.u32 %r580, 31;
- mov.u32 %r581, 16;
- mov.u32 %r582, -1;
- shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
- mov.b32 %f596, %r583;
+ mov.b32 %r578, %f762;
+ mov.u32 %r579, 31;
+ mov.u32 %r580, 16;
+ mov.u32 %r581, -1;
+ shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
+ mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
- mov.b32 %r584, %f597;
- mov.u32 %r585, 8;
- shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
- mov.b32 %f598, %r586;
+ mov.b32 %r583, %f597;
+ mov.u32 %r584, 8;
+ shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
+ mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
- mov.b32 %r587, %f599;
- mov.u32 %r588, 4;
- shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
- mov.b32 %f600, %r589;
+ mov.b32 %r586, %f599;
+ mov.u32 %r587, 4;
+ shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
+ mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
- mov.b32 %r590, %f601;
- mov.u32 %r591, 2;
- shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
- mov.b32 %f602, %r592;
+ mov.b32 %r589, %f601;
+ mov.u32 %r590, 2;
+ shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
+ mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
- mov.b32 %r593, %f603;
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
- mov.b32 %f604, %r595;
+ mov.b32 %r592, %f603;
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
+ mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
- setp.ne.s32 %p180, %r132, 0;
+ setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
- setp.ne.s32 %p181, %r131, 0;
+ setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
- setp.ge.u32 %p182, %r132, %r130;
+ setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
- mov.b32 %r596, %f763;
- mov.u32 %r597, 31;
- mov.u32 %r598, 16;
- mov.u32 %r599, -1;
- shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
- mov.b32 %f606, %r600;
+ mov.b32 %r595, %f763;
+ mov.u32 %r596, 31;
+ mov.u32 %r597, 16;
+ mov.u32 %r598, -1;
+ shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
+ mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
- mov.b32 %r601, %f607;
- mov.u32 %r602, 8;
- shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
- mov.b32 %f608, %r603;
+ mov.b32 %r600, %f607;
+ mov.u32 %r601, 8;
+ shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
+ mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
- mov.b32 %r604, %f609;
- mov.u32 %r605, 4;
- shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
- mov.b32 %f610, %r606;
+ mov.b32 %r603, %f609;
+ mov.u32 %r604, 4;
+ shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
+ mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
- mov.b32 %r607, %f611;
- mov.u32 %r608, 2;
- shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
- mov.b32 %f612, %r609;
+ mov.b32 %r606, %f611;
+ mov.u32 %r607, 2;
+ shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
+ mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
- mov.b32 %r610, %f613;
- mov.u32 %r611, 1;
- shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
- mov.b32 %f614, %r612;
+ mov.b32 %r609, %f613;
+ mov.u32 %r610, 1;
+ shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
+ mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
- setp.eq.s32 %p189, %r132, 0;
+ setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
{ cvt.rn.bf16.f32 %rs129, %f615;}
- mov.b32 %r613, %f761;
- mov.u32 %r614, 31;
- mov.u32 %r615, 16;
- mov.u32 %r616, -1;
- shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
- mov.b32 %f617, %r617;
+ mov.b32 %r612, %f761;
+ mov.u32 %r613, 31;
+ mov.u32 %r614, 16;
+ mov.u32 %r615, -1;
+ shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
+ mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
- mov.b32 %r618, %f618;
- mov.u32 %r619, 8;
- shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
- mov.b32 %f619, %r620;
+ mov.b32 %r617, %f618;
+ mov.u32 %r618, 8;
+ shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
+ mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
- mov.b32 %r621, %f620;
- mov.u32 %r622, 4;
- shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
- mov.b32 %f621, %r623;
+ mov.b32 %r620, %f620;
+ mov.u32 %r621, 4;
+ shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
+ mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
- mov.b32 %r624, %f622;
- mov.u32 %r625, 2;
- shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
- mov.b32 %f623, %r626;
+ mov.b32 %r623, %f622;
+ mov.u32 %r624, 2;
+ shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
+ mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
- mov.b32 %r627, %f624;
- mov.u32 %r628, 1;
- shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
- mov.b32 %f625, %r629;
+ mov.b32 %r626, %f624;
+ mov.u32 %r627, 1;
+ shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
+ mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
- setp.ge.u32 %p196, %r132, %r130;
+ setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
- mov.b32 %r630, %f765;
- mov.u32 %r631, 31;
- mov.u32 %r632, 16;
- mov.u32 %r633, -1;
- shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
- mov.b32 %f627, %r634;
+ mov.b32 %r629, %f765;
+ mov.u32 %r630, 31;
+ mov.u32 %r631, 16;
+ mov.u32 %r632, -1;
+ shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
+ mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
- mov.b32 %r635, %f628;
- mov.u32 %r636, 8;
- shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
- mov.b32 %f629, %r637;
+ mov.b32 %r634, %f628;
+ mov.u32 %r635, 8;
+ shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
+ mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
- mov.b32 %r638, %f630;
- mov.u32 %r639, 4;
- shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
- mov.b32 %f631, %r640;
+ mov.b32 %r637, %f630;
+ mov.u32 %r638, 4;
+ shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
+ mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
- mov.b32 %r641, %f632;
- mov.u32 %r642, 2;
- shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
- mov.b32 %f633, %r643;
+ mov.b32 %r640, %f632;
+ mov.u32 %r641, 2;
+ shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
+ mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
- mov.b32 %r644, %f634;
- mov.u32 %r645, 1;
- shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
- mov.b32 %f635, %r646;
+ mov.b32 %r643, %f634;
+ mov.u32 %r644, 1;
+ shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
+ mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
@@ -2432,23 +2426,23 @@
{ cvt.rn.bf16.f32 %rs130, %f636;}
@%p6 bra $L__BB0_256;
- mul.lo.s32 %r152, %r128, %r770;
- add.s32 %r647, %r127, %r152;
- setp.ge.s32 %p204, %r647, %r203;
+ mul.lo.s32 %r151, %r127, %r764;
+ add.s32 %r646, %r126, %r151;
+ setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
- add.s32 %r648, %r129, %r152;
- mul.wide.s32 %rd208, %r648, 2;
+ add.s32 %r647, %r128, %r151;
+ mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
- add.s32 %r770, %r770, 1;
- setp.lt.s32 %p205, %r770, %r124;
+ add.s32 %r764, %r764, 1;
+ setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
23: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_1024
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-10
+10 index type: int
registers: 60→ 56
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
__global__ void nvfuser_N(Tensor<__bfloat, 2, 2> T0, Tensor<__bfloat, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<__bfloat, 1, 1> T4, Tensor<__bfloat, 1, 1> T5, Tensor<__bfloat, 2, 2> T28, Tensor<__bfloat, 1, 1> T30, Tensor<__bfloat, 1, 1> T29, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<int64_t, 1, 1> T66) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize(((ceilDiv((max((ceilDiv(T0.logical_size[1LL], 8)), ((nvfuser_index_t)blockDim.x))), 32)) * 32) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
NVFUSER_DEFINE_MAGIC_ZERO;
__bfloat* T44 = reinterpret_cast<__bfloat*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2)) + 15) & -16));
__bfloat* T41 = reinterpret_cast<__bfloat*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 8))) * 8) * 2) + 15) & -16));
__bfloat* T40 = reinterpret_cast<__bfloat*>(array + smem_offset + 0);
Tensor<__bfloat, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T44) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (8 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
__syncthreads();
Array<float, 8, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
T57[i6] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
T62[i7] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T60;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T60[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<float, 8, 1> T55;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T55[i8] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i9 = 0; i9 < (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i9) {
Array<float, 1, 1> T42;
T42[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T42[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i9))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T43;
T43[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216)) {
T43[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i9))];
}
}
Array<float, 1, 1> T23;
T23[0]
= (float) d5
* T43[0];
Array<float, 1, 1> T15;
T15[0] = 0.000000000e+00f;
Array<float, 1, 1> T18;
T18[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
T60[i8]
= T60[i8]
+ T6[0];
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
T54[0]
= T54[0]
+ T13[0];
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
T55[i8]
= T55[i8]
+ T25[0];
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
T65[0]
= T65[0]
+ T17[0];
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T60[i8]
= T60[i8]
+ T6[0];
}
Array<float, 1, 1> T8;
T8[0]
= __bfloat2float(T51[i8]);
Array<float, 1, 1> T12;
T12[0]
= T8[0];
Array<float, 1, 1> T13;
T13[0]
= T6[0]
* T12[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T54[0]
= T54[0]
+ T13[0];
}
Array<float, 1, 1> T7;
T7[0]
= __bfloat2float(T53[i8]);
Array<float, 1, 1> T10;
T10[0]
= T7[0]
- T42[0];
Array<float, 1, 1> T11;
T11[0]
= T10[0]
* T43[0];
Array<float, 1, 1> T25;
T25[0]
= T6[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T55[i8]
= T55[i8]
+ T25[0];
}
Array<float, 1, 1> T17;
T17[0]
= T13[0]
* T11[0];
if (((((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
T65[0]
= T65[0]
+ T17[0];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
warp::warpReduceTIDX<false, true>(T15[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
warp::warpReduceTIDX<false, true>(T18[0], T65[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T16;
broadcast::blockBroadcast<true, false, false, true>(T16[0], T15[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
T32[0]
= __bfloat2float(T50[i10]);
Array<float, 1, 1> T33;
T33[0]
= T32[0];
Array<float, 1, 1> T31;
T31[0]
= __bfloat2float(T48[i10]);
Array<float, 1, 1> T34;
T34[0]
= T31[0]
* T33[0];
Array<float, 1, 1> T14;
T14[0]
= (float) d4
* T34[0];
Array<float, 1, 1> T36;
T36[0]
= __bfloat2float(T52[i10]);
Array<float, 1, 1> T37;
T37[0]
= T36[0]
- T42[0];
Array<float, 1, 1> T38;
T38[0]
= T37[0]
* T43[0];
Array<float, 1, 1> T21;
T21[0]
= T14[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T38[0]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T45[i10]
= __float2bfloat(T24[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 8; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 8; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i11 + nvfuser_zero)))], &T57[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
if ((((((nvfuser_index_t)threadIdx.y) == 0) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8)))) && (((3 - i2) + (8 * ((nvfuser_index_t)threadIdx.x))) < (-(4 * (i12 + nvfuser_zero)))))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[(((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y))) + (4 * (i12 + nvfuser_zero)))], &T62[(4 * i12)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
// Allocate global tensor T66
grid_sync::sync<false, true, false, true, true>(T66[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i13) {
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i15 = 0; i15 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i15) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i15)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) + ((((nvfuser_index_t)blockDim.x) * i2) * i15))]);
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T59[i14]
= T59[i14]
+ T58[i14];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T46;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
Array<float, 1, 1> T26;
T26[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T26[0], T59[i16], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T46[i16]
= __float2bfloat(T26[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T30[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i13))], &T46[0]);
}
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv((ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i17) {
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18] = 0.000000000e+00f;
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll 1
for(nvfuser_index_t i19 = 0; i19 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i19) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i19)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[(((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) + ((((nvfuser_index_t)blockDim.x) * i2) * i19))]);
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T64[i18]
= T64[i18]
+ T63[i18];
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
Array<__bfloat, 2, 2> T47;
#pragma unroll
for(nvfuser_index_t i20 = 0; i20 < 2; ++i20) {
Array<float, 1, 1> T27;
T27[0] = 0.000000000e+00f;
warp::warpReduceTIDX<false, true>(T27[0], T64[i20], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, static_cast<float>(0.000000000e+00f), DefaultBlockDim());
T47[i20]
= __float2bfloat(T27[0]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) == 0) && ((((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17)) < i2))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/2, /*is_volatile=*/false>( &T29[(((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + (((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)gridDim.y)) * i17))], &T47[0]);
}
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -83,11 +83,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T41) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -97,11 +97,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((2 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T40) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (8 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -130,17 +130,17 @@
Array<float, 1, 1> T65;
T65[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
= __bfloat2float(T49[i8]);
@@ -189,21 +189,21 @@
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
Array<__bfloat, 8, 8> T53;
T53.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T53[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T51;
T51.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T51[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T49;
T49.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T49[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
Array<float, 1, 1> T6;
T6[0]
@@ -267,17 +267,17 @@
Array<float, 1, 1> T19;
broadcast::blockBroadcast<true, false, false, true>(T19[0], T18[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
Array<float, 1, 1> T32;
@@ -331,21 +331,21 @@
loadLocalToGlobal<__bfloat, /*vec_size=*/8, /*is_volatile=*/false>( &T28[((((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i9))], &T45[0]);
} else {
Array<__bfloat, 8, 8> T52;
T52.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T52[0], &T41[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<__bfloat, 8, 8> T50;
T50.set(__bfloat(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<__bfloat, 8>( &T50[0], &T44[(8 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<__bfloat, 8, 8> T48;
T48.set(__bfloat(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 8))) && ((7 + (8 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(216, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i9)) < 216))) {
- loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<__bfloat, 8>( &T48[0], &T40[((8 * ((nvfuser_index_t)threadIdx.x)) + ((8 * (ceilDiv(i2, 8))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T45 = T48;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 8; ++i10) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<779>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r202, %r203}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r212, %r213}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r216, %r217}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r238, %r203, 7;
shr.s32 %r239, %r238, 31;
shr.u32 %r240, %r239, 29;
add.s32 %r241, %r238, %r240;
shr.s32 %r2, %r241, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r242, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r243, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r244, %r4, %r2;
shl.b32 %r245, %r244, 4;
or.b32 %r246, %r245, 15;
and.b32 %r7, %r246, -16;
add.s32 %r247, %r246, %r7;
and.b32 %r248, %r247, -16;
cvt.s64.s32 %rd1, %r248;
max.s32 %r249, %r2, %r3;
add.s32 %r250, %r249, 31;
shr.s32 %r251, %r250, 31;
shr.u32 %r252, %r251, 27;
add.s32 %r253, %r250, %r252;
shr.u32 %r254, %r253, 5;
mul.lo.s32 %r255, %r4, %r254;
shl.b32 %r256, %r255, 7;
cvt.u64.u32 %rd2, %r256;
mov.u64 %rd44, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_103395arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r257, %r8, 7;
setp.lt.s32 %p7, %r257, %r203;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
// end inline asm
shl.b32 %r261, %r5, 4;
add.s32 %r259, %r258, %r261;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r260, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r260, 0;
cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r735, %r6, 4;
add.s32 %r262, %r4, 215;
div.s32 %r263, %r262, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r264, %r11, %r263;
add.s32 %r265, %r264, -1;
div.s32 %r12, %r265, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r203;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r267, %ctaid.y;
mul.lo.s32 %r268, %r12, %r4;
mul.lo.s32 %r13, %r268, %r267;
shl.b32 %r269, %r9, 1;
shl.b32 %r270, %r5, 4;
mad.lo.s32 %r14, %r269, %r203, %r270;
mul.lo.s32 %r271, %r203, %r9;
cvt.s64.s32 %rd53, %r271;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r272, %r13, %r203;
cvt.s64.s32 %rd6, %r272;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
add.s32 %r15, %r271, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r15, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r16, %r3, 5;
mul.lo.s32 %r275, %r274, %r16;
shr.u32 %r17, %r5, 5;
add.s32 %r276, %r275, %r17;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r18, %r5, 31;
add.s32 %r277, %r275, %r18;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r734, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r14, %r280;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r14, %r283;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r23, %r734, %r4;
add.s32 %r278, %r23, %r9;
add.s32 %r24, %r278, %r13;
setp.gt.s32 %p13, %r24, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r24, %r212;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r24, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r23, %r203;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r24, %r216;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ mov.b32 %f234, {0,%rs36};}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ mov.b32 %f235, {0,%rs37};}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ mov.b32 %f236, {0,%rs38};}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ mov.b32 %f237, {0,%rs39};}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ mov.b32 %f238, {0,%rs40};}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ mov.b32 %f239, {0,%rs41};}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ mov.b32 %f240, {0,%rs42};}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ mov.b32 %f241, {0,%rs43};}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ mov.b32 %f242, {0,%rs44};}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ mov.b32 %f243, {0,%rs45};}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ mov.b32 %f244, {0,%rs46};}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ mov.b32 %f245, {0,%rs47};}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ mov.b32 %f246, {0,%rs48};}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ mov.b32 %f247, {0,%rs49};}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ mov.b32 %f248, {0,%rs50};}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ mov.b32 %f249, {0,%rs51};}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ mov.b32 %f250, {0,%rs52};}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ mov.b32 %f251, {0,%rs53};}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ mov.b32 %f252, {0,%rs54};}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ mov.b32 %f253, {0,%rs55};}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ mov.b32 %f254, {0,%rs56};}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ mov.b32 %f255, {0,%rs57};}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ mov.b32 %f256, {0,%rs58};}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ mov.b32 %f257, {0,%rs59};}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r735, %r735, 2;
bar.sync 0;
setp.ne.s32 %p23, %r18, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r17, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r18, %r16;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r18, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r18, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r17, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r18, %r16;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r18, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ mov.b32 %f374, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ mov.b32 %f375, {0,%rs98};}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ mov.b32 %f376, {0,%rs99};}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ mov.b32 %f378, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f379, {0,%rs102};}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ mov.b32 %f380, {0,%rs103};}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ mov.b32 %f382, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ mov.b32 %f383, {0,%rs106};}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ mov.b32 %f384, {0,%rs107};}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ mov.b32 %f386, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f387, {0,%rs110};}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ mov.b32 %f388, {0,%rs111};}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ mov.b32 %f390, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ mov.b32 %f391, {0,%rs114};}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ mov.b32 %f392, {0,%rs115};}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ mov.b32 %f394, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f395, {0,%rs118};}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ mov.b32 %f396, {0,%rs119};}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ mov.b32 %f398, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ mov.b32 %f399, {0,%rs122};}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ mov.b32 %f400, {0,%rs123};}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ mov.b32 %f402, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f403, {0,%rs126};}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ mov.b32 %f404, {0,%rs127};}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
add.s32 %r416, %r13, %r732;
mad.lo.s32 %r417, %r416, %r203, %r15;
mul.wide.s32 %rd76, %r417, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r734, %r734, 1;
setp.lt.s32 %p49, %r734, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r418, %tid.z;
mad.lo.s32 %r46, %r418, %r4, %r9;
mad.lo.s32 %r47, %r46, %r3, %r5;
mul.wide.u32 %rd77, %r47, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r419, %r4;
mov.u32 %r420, 31;
sub.s32 %r48, %r420, %r419;
mov.u32 %r421, 1;
shl.b32 %r766, %r421, %r48;
setp.lt.u32 %p50, %r9, %r766;
add.s32 %r422, %r766, %r9;
setp.lt.u32 %p51, %r422, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r423, %r3, %r48;
add.s32 %r424, %r47, %r423;
mul.wide.s32 %rd79, %r424, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r766, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r736, %r766;
$L__BB0_40:
shr.u32 %r51, %r736, 1;
setp.ge.u32 %p54, %r9, %r51;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r425, %r51, %r3, %r47;
mul.wide.s32 %rd82, %r425, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r736, 7;
mov.u32 %r736, %r51;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r737, 0;
add.s32 %r427, %r47, %r3;
mul.wide.u32 %rd85, %r427, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r737, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r428, %r3, %r48;
add.s32 %r429, %r47, %r428;
mul.wide.s32 %rd87, %r429, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r738, %r766;
$L__BB0_51:
shr.u32 %r55, %r738, 1;
setp.ge.u32 %p60, %r9, %r55;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r430, %r55, %r3, %r47;
mul.wide.s32 %rd90, %r430, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r738, 7;
mov.u32 %r738, %r55;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r739, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r432, %r3, %r48;
add.s32 %r433, %r47, %r432;
mul.wide.s32 %rd93, %r433, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r740, %r766;
$L__BB0_62:
shr.u32 %r59, %r740, 1;
setp.ge.u32 %p66, %r9, %r59;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r434, %r59, %r3, %r47;
mul.wide.s32 %rd96, %r434, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r740, 7;
mov.u32 %r740, %r59;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r741, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r436, %r3, %r48;
add.s32 %r437, %r47, %r436;
mul.wide.s32 %rd99, %r437, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r742, %r766;
$L__BB0_73:
shr.u32 %r63, %r742, 1;
setp.ge.u32 %p72, %r9, %r63;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r438, %r63, %r3, %r47;
mul.wide.s32 %rd102, %r438, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r742, 7;
mov.u32 %r742, %r63;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r743, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r440, %r3, %r48;
add.s32 %r441, %r47, %r440;
mul.wide.s32 %rd105, %r441, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r744, %r766;
$L__BB0_84:
shr.u32 %r67, %r744, 1;
setp.ge.u32 %p78, %r9, %r67;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r442, %r67, %r3, %r47;
mul.wide.s32 %rd108, %r442, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r744, 7;
mov.u32 %r744, %r67;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r745, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r444, %r3, %r48;
add.s32 %r445, %r47, %r444;
mul.wide.s32 %rd111, %r445, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r746, %r766;
$L__BB0_95:
shr.u32 %r71, %r746, 1;
setp.ge.u32 %p84, %r9, %r71;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r446, %r71, %r3, %r47;
mul.wide.s32 %rd114, %r446, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r746, 7;
mov.u32 %r746, %r71;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r747, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r448, %r3, %r48;
add.s32 %r449, %r47, %r448;
mul.wide.s32 %rd117, %r449, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r748, %r766;
$L__BB0_106:
shr.u32 %r75, %r748, 1;
setp.ge.u32 %p90, %r9, %r75;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r450, %r75, %r3, %r47;
mul.wide.s32 %rd120, %r450, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r748, 7;
mov.u32 %r748, %r75;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r749, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r452, %r3, %r48;
add.s32 %r453, %r47, %r452;
mul.wide.s32 %rd123, %r453, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r750, %r766;
$L__BB0_117:
shr.u32 %r79, %r750, 1;
setp.ge.u32 %p96, %r9, %r79;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r454, %r79, %r3, %r47;
mul.wide.s32 %rd126, %r454, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r750, 7;
mov.u32 %r750, %r79;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r751, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r82, %r735, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r456, %r3, %r48;
add.s32 %r457, %r47, %r456;
mul.wide.s32 %rd129, %r457, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r752, %r766;
$L__BB0_128:
shr.u32 %r84, %r752, 1;
setp.ge.u32 %p102, %r9, %r84;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r458, %r84, %r3, %r47;
mul.wide.s32 %rd132, %r458, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r752, 7;
mov.u32 %r752, %r84;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r753, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r460, %r3, %r48;
add.s32 %r461, %r47, %r460;
mul.wide.s32 %rd135, %r461, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r754, %r766;
$L__BB0_139:
shr.u32 %r88, %r754, 1;
setp.ge.u32 %p108, %r9, %r88;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r462, %r88, %r3, %r47;
mul.wide.s32 %rd138, %r462, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r754, 7;
mov.u32 %r754, %r88;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r755, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r464, %r3, %r48;
add.s32 %r465, %r47, %r464;
mul.wide.s32 %rd141, %r465, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r756, %r766;
$L__BB0_150:
shr.u32 %r92, %r756, 1;
setp.ge.u32 %p114, %r9, %r92;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r466, %r92, %r3, %r47;
mul.wide.s32 %rd144, %r466, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r756, 7;
mov.u32 %r756, %r92;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r757, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r468, %r3, %r48;
add.s32 %r469, %r47, %r468;
mul.wide.s32 %rd147, %r469, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r758, %r766;
$L__BB0_161:
shr.u32 %r96, %r758, 1;
setp.ge.u32 %p120, %r9, %r96;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r470, %r96, %r3, %r47;
mul.wide.s32 %rd150, %r470, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r758, 7;
mov.u32 %r758, %r96;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r759, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r472, %r3, %r48;
add.s32 %r473, %r47, %r472;
mul.wide.s32 %rd153, %r473, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r760, %r766;
$L__BB0_172:
shr.u32 %r100, %r760, 1;
setp.ge.u32 %p126, %r9, %r100;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r474, %r100, %r3, %r47;
mul.wide.s32 %rd156, %r474, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r760, 7;
mov.u32 %r760, %r100;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r761, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r476, %r3, %r48;
add.s32 %r477, %r47, %r476;
mul.wide.s32 %rd159, %r477, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r762, %r766;
$L__BB0_183:
shr.u32 %r104, %r762, 1;
setp.ge.u32 %p132, %r9, %r104;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r478, %r104, %r3, %r47;
mul.wide.s32 %rd162, %r478, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r762, 7;
mov.u32 %r762, %r104;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r763, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r763, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r480, %r3, %r48;
add.s32 %r481, %r47, %r480;
mul.wide.s32 %rd165, %r481, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r764, %r766;
$L__BB0_194:
shr.u32 %r108, %r764, 1;
setp.ge.u32 %p138, %r9, %r108;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r482, %r108, %r3, %r47;
mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r764, 7;
mov.u32 %r764, %r108;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r765, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r765, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r484, %r3, %r48;
add.s32 %r485, %r47, %r484;
mul.wide.s32 %rd171, %r485, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r112, %r766, 1;
setp.ge.u32 %p144, %r9, %r112;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r486, %r112, %r3, %r47;
mul.wide.s32 %rd174, %r486, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r766, 7;
mov.u32 %r766, %r112;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r767, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r767, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
shl.b32 %r731, %r5, 3;
mov.u32 %r512, %ctaid.y;
mad.lo.s32 %r513, %r203, %r512, %r731;
add.s32 %r514, %r513, %r82;
mul.wide.s32 %rd183, %r514, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
// end inline asm
add.s32 %r515, %r514, 4;
mul.wide.s32 %rd184, %r515, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r488, %r728, 3;
sub.s32 %r115, %r488, %r203;
mov.u32 %r489, %ctaid.y;
mad.lo.s32 %r116, %r203, %r489, %r728;
neg.s32 %r490, %r82;
setp.ge.s32 %p151, %r115, %r490;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r495, %r116, %r82;
mul.wide.s32 %rd178, %r495, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
// end inline asm
$L__BB0_214:
mov.u32 %r496, -4;
sub.s32 %r497, %r496, %r82;
setp.ge.s32 %p153, %r115, %r497;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r502, %r116, %r82;
add.s32 %r503, %r502, 4;
mul.wide.s32 %rd180, %r503, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r730, %r5, 3;
shl.b32 %r540, %r735, 5;
mov.u32 %r541, %ctaid.y;
mad.lo.s32 %r542, %r203, %r541, %r730;
add.s32 %r543, %r542, %r540;
mul.wide.s32 %rd191, %r543, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
// end inline asm
add.s32 %r544, %r543, 4;
mul.wide.s32 %rd192, %r544, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r516, %r729, 3;
sub.s32 %r117, %r516, %r203;
mov.u32 %r517, %ctaid.y;
mad.lo.s32 %r118, %r203, %r517, %r729;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r119, %r735, 5;
neg.s32 %r518, %r119;
setp.ge.s32 %p160, %r117, %r518;
@%p160 bra $L__BB0_222;
add.s32 %r523, %r118, %r119;
mul.wide.s32 %rd186, %r523, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r120, %r735, 5;
mov.u32 %r524, -4;
sub.s32 %r525, %r524, %r120;
setp.ge.s32 %p162, %r117, %r525;
@%p162 bra $L__BB0_227;
add.s32 %r530, %r118, %r120;
add.s32 %r531, %r530, 4;
mul.wide.s32 %rd188, %r531, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
// end inline asm
$L__BB0_227:
mov.u32 %r121, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r545, %r5, %r9;
or.b32 %r547, %r545, %r418;
setp.ne.s32 %p164, %r547, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r548, %ctaid.x;
mov.u32 %r549, %ctaid.z;
mov.u32 %r550, %nctaid.x;
mad.lo.s32 %r551, %r549, %r550, %r548;
mul.wide.s32 %rd194, %r551, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r552, %r11, -1;
setp.eq.s32 %p165, %r121, %r552;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r768, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r768;
// end inline asm
setp.lt.u32 %p167, %r768, 256;
selp.u32 %r555, 1, 0, %p167;
shl.b32 %r768, %r768, %r555;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_e501cd20_1033910nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r557, %r203, 1;
shr.u32 %r558, %r557, 31;
add.s32 %r559, %r557, %r558;
shr.s32 %r560, %r559, 1;
add.s32 %r561, %r4, %r560;
add.s32 %r562, %r561, -1;
div.s32 %r563, %r562, %r4;
add.s32 %r564, %r11, -1;
add.s32 %r565, %r564, %r563;
div.s32 %r124, %r565, %r11;
add.s32 %r125, %r564, %r3;
shl.b32 %r126, %r9, 1;
shl.b32 %r566, %r4, 1;
mad.lo.s32 %r129, %r566, %r121, %r126;
or.b32 %r127, %r129, 1;
mul.lo.s32 %r128, %r566, %r11;
shr.u32 %r130, %r3, 5;
mul.lo.s32 %r567, %r46, %r130;
shr.u32 %r131, %r5, 5;
add.s32 %r568, %r567, %r131;
mul.wide.u32 %rd203, %r568, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r132, %r5, 31;
add.s32 %r569, %r567, %r132;
mul.wide.u32 %rd205, %r569, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r769, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r769, %r769, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r769, %r124;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r154, %r125, %r3;
setp.lt.s32 %p206, %r154, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r650, %r128, %r769;
add.s32 %r155, %r127, %r650;
add.s32 %r156, %r129, %r650;
mov.u32 %r649, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r776, %r649;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r155, %r203;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r158, %r776, %r3, %r5;
setp.ge.s32 %p208, %r158, %r11;
mov.u32 %r777, %r649;
mov.u32 %r778, %r649;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r657, %r158, %r203, %r156;
mul.wide.s32 %rd211, %r657, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r778;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r777;
add.f32 %f769, %f769, %f643;
add.s32 %r776, %r776, 1;
setp.lt.s32 %p209, %r776, %r154;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r658, %f770;
mov.u32 %r659, 31;
mov.u32 %r660, 16;
mov.u32 %r661, -1;
shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
mov.b32 %f644, %r662;
add.f32 %f645, %f770, %f644;
mov.b32 %r663, %f645;
mov.u32 %r664, 8;
shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
mov.b32 %f646, %r665;
add.f32 %f647, %f645, %f646;
mov.b32 %r666, %f647;
mov.u32 %r667, 4;
shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
mov.b32 %f648, %r668;
add.f32 %f649, %f647, %f648;
mov.b32 %r669, %f649;
mov.u32 %r670, 2;
shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
mov.b32 %f650, %r671;
add.f32 %f651, %f649, %f650;
mov.b32 %r672, %f651;
mov.u32 %r673, 1;
shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
mov.b32 %f652, %r674;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r132, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r131, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r132, %r130;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r675, %f771;
mov.u32 %r676, 31;
mov.u32 %r677, 16;
mov.u32 %r678, -1;
shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
mov.b32 %f654, %r679;
add.f32 %f655, %f771, %f654;
mov.b32 %r680, %f655;
mov.u32 %r681, 8;
shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
mov.b32 %f656, %r682;
add.f32 %f657, %f655, %f656;
mov.b32 %r683, %f657;
mov.u32 %r684, 4;
shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
mov.b32 %f658, %r685;
add.f32 %f659, %f657, %f658;
mov.b32 %r686, %f659;
mov.u32 %r687, 2;
shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
mov.b32 %f660, %r688;
add.f32 %f661, %f659, %f660;
mov.b32 %r689, %f661;
mov.u32 %r690, 1;
shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
mov.b32 %f662, %r691;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r132, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r692, %f769;
mov.u32 %r693, 31;
mov.u32 %r694, 16;
mov.u32 %r695, -1;
shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
mov.b32 %f665, %r696;
add.f32 %f666, %f769, %f665;
mov.b32 %r697, %f666;
mov.u32 %r698, 8;
shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
mov.b32 %f667, %r699;
add.f32 %f668, %f666, %f667;
mov.b32 %r700, %f668;
mov.u32 %r701, 4;
shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
mov.b32 %f669, %r702;
add.f32 %f670, %f668, %f669;
mov.b32 %r703, %f670;
mov.u32 %r704, 2;
shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
mov.b32 %f671, %r705;
add.f32 %f672, %f670, %f671;
mov.b32 %r706, %f672;
mov.u32 %r707, 1;
shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
mov.b32 %f673, %r708;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r132, %r130;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r709, %f773;
mov.u32 %r710, 31;
mov.u32 %r711, 16;
mov.u32 %r712, -1;
shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
mov.b32 %f675, %r713;
add.f32 %f676, %f773, %f675;
mov.b32 %r714, %f676;
mov.u32 %r715, 8;
shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
mov.b32 %f677, %r716;
add.f32 %f678, %f676, %f677;
mov.b32 %r717, %f678;
mov.u32 %r718, 4;
shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
mov.b32 %f679, %r719;
add.f32 %f680, %f678, %f679;
mov.b32 %r720, %f680;
mov.u32 %r721, 2;
shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
mov.b32 %f681, %r722;
add.f32 %f682, %f680, %f681;
mov.b32 %r723, %f682;
mov.u32 %r724, 1;
shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
mov.b32 %f683, %r725;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r164, %r128, %r769;
add.s32 %r726, %r127, %r164;
setp.ge.s32 %p239, %r726, %r203;
@%p239 bra $L__BB0_279;
add.s32 %r727, %r129, %r164;
mul.wide.s32 %rd212, %r727, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r124, 1;
@%p170 bra $L__BB0_257;
div.s32 %r134, %r125, %r3;
mad.lo.s32 %r135, %r203, %r5, %r126;
shl.b32 %r136, %r121, 1;
shl.b32 %r137, %r11, 1;
mul.lo.s32 %r138, %r203, %r3;
mov.u32 %r770, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r134, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r140, %r128, %r770, %r127;
mad.lo.s32 %r572, %r137, %r770, %r136;
mad.lo.s32 %r772, %r4, %r572, %r135;
mov.u32 %r571, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r771, %r5;
mov.u32 %r773, %r571;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r140, %r203;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r771, %r11;
mov.u32 %r774, %r571;
mov.u32 %r775, %r571;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r772, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r775;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r774;
add.f32 %f761, %f761, %f595;
add.s32 %r772, %r772, %r138;
add.s32 %r771, %r771, %r3;
add.s32 %r773, %r773, 1;
setp.lt.s32 %p174, %r773, %r134;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r579, %f762;
mov.u32 %r580, 31;
mov.u32 %r581, 16;
mov.u32 %r582, -1;
shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
mov.b32 %f596, %r583;
add.f32 %f597, %f762, %f596;
mov.b32 %r584, %f597;
mov.u32 %r585, 8;
shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
mov.b32 %f598, %r586;
add.f32 %f599, %f597, %f598;
mov.b32 %r587, %f599;
mov.u32 %r588, 4;
shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
mov.b32 %f600, %r589;
add.f32 %f601, %f599, %f600;
mov.b32 %r590, %f601;
mov.u32 %r591, 2;
shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
mov.b32 %f602, %r592;
add.f32 %f603, %f601, %f602;
mov.b32 %r593, %f603;
mov.u32 %r594, 1;
shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
mov.b32 %f604, %r595;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r132, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r131, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r132, %r130;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r596, %f763;
mov.u32 %r597, 31;
mov.u32 %r598, 16;
mov.u32 %r599, -1;
shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
mov.b32 %f606, %r600;
add.f32 %f607, %f763, %f606;
mov.b32 %r601, %f607;
mov.u32 %r602, 8;
shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
mov.b32 %f608, %r603;
add.f32 %f609, %f607, %f608;
mov.b32 %r604, %f609;
mov.u32 %r605, 4;
shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
mov.b32 %f610, %r606;
add.f32 %f611, %f609, %f610;
mov.b32 %r607, %f611;
mov.u32 %r608, 2;
shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
mov.b32 %f612, %r609;
add.f32 %f613, %f611, %f612;
mov.b32 %r610, %f613;
mov.u32 %r611, 1;
shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
mov.b32 %f614, %r612;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r132, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r613, %f761;
mov.u32 %r614, 31;
mov.u32 %r615, 16;
mov.u32 %r616, -1;
shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
mov.b32 %f617, %r617;
add.f32 %f618, %f761, %f617;
mov.b32 %r618, %f618;
mov.u32 %r619, 8;
shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
mov.b32 %f619, %r620;
add.f32 %f620, %f618, %f619;
mov.b32 %r621, %f620;
mov.u32 %r622, 4;
shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
mov.b32 %f621, %r623;
add.f32 %f622, %f620, %f621;
mov.b32 %r624, %f622;
mov.u32 %r625, 2;
shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
mov.b32 %f623, %r626;
add.f32 %f624, %f622, %f623;
mov.b32 %r627, %f624;
mov.u32 %r628, 1;
shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
mov.b32 %f625, %r629;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r132, %r130;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r630, %f765;
mov.u32 %r631, 31;
mov.u32 %r632, 16;
mov.u32 %r633, -1;
shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
mov.b32 %f627, %r634;
add.f32 %f628, %f765, %f627;
mov.b32 %r635, %f628;
mov.u32 %r636, 8;
shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
mov.b32 %f629, %r637;
add.f32 %f630, %f628, %f629;
mov.b32 %r638, %f630;
mov.u32 %r639, 4;
shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
mov.b32 %f631, %r640;
add.f32 %f632, %f630, %f631;
mov.b32 %r641, %f632;
mov.u32 %r642, 2;
shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
mov.b32 %f633, %r643;
add.f32 %f634, %f632, %f633;
mov.b32 %r644, %f634;
mov.u32 %r645, 1;
shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
mov.b32 %f635, %r646;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r152, %r128, %r770;
add.s32 %r647, %r127, %r152;
setp.ge.s32 %p204, %r647, %r203;
@%p204 bra $L__BB0_256;
add.s32 %r648, %r129, %r152;
mul.wide.s32 %rd208, %r648, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r770, %r770, 1;
setp.lt.s32 %p205, %r770, %r124;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
.reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r201, %r202}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r211, %r212}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r215, %r216}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r237, %r202, 7;
shr.s32 %r238, %r237, 31;
shr.u32 %r239, %r238, 29;
add.s32 %r240, %r237, %r239;
shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
mov.u32 %r241, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
mul.lo.s32 %r243, %r4, %r2;
shl.b32 %r244, %r243, 4;
or.b32 %r245, %r244, 15;
and.b32 %r7, %r245, -16;
add.s32 %r246, %r245, %r7;
and.b32 %r247, %r246, -16;
cvt.s64.s32 %rd1, %r247;
max.s32 %r248, %r2, %r3;
add.s32 %r249, %r248, 31;
shr.s32 %r250, %r249, 31;
shr.u32 %r251, %r250, 27;
add.s32 %r252, %r249, %r251;
shr.u32 %r253, %r252, 5;
mul.lo.s32 %r254, %r4, %r253;
shl.b32 %r255, %r254, 7;
cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_72335arrayE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
or.b32 %r256, %r8, 7;
setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
// end inline asm
shl.b32 %r260, %r5, 4;
add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
mov.u32 %r259, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r259, 0;
cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
// end inline asm
$L__BB0_4:
bar.sync 0;
shl.b32 %r729, %r6, 4;
add.s32 %r261, %r4, 215;
div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
add.s32 %r263, %r11, %r262;
add.s32 %r264, %r263, -1;
div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
mov.u32 %r266, %ctaid.y;
mul.lo.s32 %r267, %r12, %r4;
mul.lo.s32 %r13, %r267, %r266;
mad.lo.s32 %r268, %r2, %r9, %r5;
shl.b32 %r14, %r268, 4;
mul.lo.s32 %r269, %r202, %r9;
cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
mul.lo.s32 %r270, %r13, %r202;
cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
shl.b32 %r271, %r9, 3;
mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
shr.u32 %r15, %r3, 5;
mul.lo.s32 %r275, %r274, %r15;
shr.u32 %r16, %r5, 5;
add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
and.b32 %r17, %r5, 31;
add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
// end inline asm
add.s32 %r281, %r280, %r14;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
// end inline asm
add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
mov.f32 %f692, %f200;
mov.f32 %f693, %f200;
mov.f32 %f694, %f200;
mov.f32 %f695, %f200;
mov.f32 %f696, %f200;
mov.f32 %f697, %f200;
mov.f32 %f698, %f200;
mov.f32 %f699, %f200;
mov.f32 %f700, %f200;
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
mul.lo.s32 %r22, %r728, %r4;
add.s32 %r278, %r22, %r9;
add.s32 %r23, %r278, %r13;
setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
mov.u32 %r285, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r281], [%rd66], 16, p0;
}
// end inline asm
add.s64 %rd68, %rd32, %rd72;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r285, 0;
cp.async.ca.shared.global [%r284], [%rd68], 16, p0;
}
// end inline asm
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
// begin inline asm
cp.async.wait_all;
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs7, %f202;}
// end inline asm
@%p2 bra $L__BB0_15;
bra.uni $L__BB0_14;
$L__BB0_15:
ld.shared.v4.u32 {%r292, %r293, %r294, %r295}, [%rd7];
ld.shared.v4.u32 {%r300, %r301, %r302, %r303}, [%rd9];
ld.shared.v4.u32 {%r308, %r309, %r310, %r311}, [%rd11];
mov.b32 {%rs36, %rs39}, %r308;
// begin inline asm
{ mov.b32 %f234, {0,%rs36};}
// end inline asm
add.f32 %f694, %f234, %f694;
mov.b32 {%rs37, %rs40}, %r300;
// begin inline asm
{ mov.b32 %f235, {0,%rs37};}
// end inline asm
mul.f32 %f258, %f234, %f235;
add.f32 %f259, %f258, 0f00000000;
mov.b32 {%rs38, %rs41}, %r292;
// begin inline asm
{ mov.b32 %f236, {0,%rs38};}
// end inline asm
sub.f32 %f260, %f236, %f703;
mul.f32 %f261, %f704, %f260;
fma.rn.f32 %f702, %f234, %f261, %f702;
fma.rn.f32 %f262, %f258, %f261, 0f00000000;
// begin inline asm
{ mov.b32 %f237, {0,%rs39};}
// end inline asm
add.f32 %f693, %f237, %f693;
// begin inline asm
{ mov.b32 %f238, {0,%rs40};}
// end inline asm
mul.f32 %f263, %f237, %f238;
add.f32 %f264, %f259, %f263;
// begin inline asm
{ mov.b32 %f239, {0,%rs41};}
// end inline asm
sub.f32 %f265, %f239, %f703;
mul.f32 %f266, %f704, %f265;
fma.rn.f32 %f701, %f237, %f266, %f701;
fma.rn.f32 %f267, %f263, %f266, %f262;
mov.b32 {%rs42, %rs45}, %r309;
// begin inline asm
{ mov.b32 %f240, {0,%rs42};}
// end inline asm
add.f32 %f692, %f240, %f692;
mov.b32 {%rs43, %rs46}, %r301;
// begin inline asm
{ mov.b32 %f241, {0,%rs43};}
// end inline asm
mul.f32 %f268, %f240, %f241;
add.f32 %f269, %f264, %f268;
mov.b32 {%rs44, %rs47}, %r293;
// begin inline asm
{ mov.b32 %f242, {0,%rs44};}
// end inline asm
sub.f32 %f270, %f242, %f703;
mul.f32 %f271, %f704, %f270;
fma.rn.f32 %f700, %f240, %f271, %f700;
fma.rn.f32 %f272, %f268, %f271, %f267;
// begin inline asm
{ mov.b32 %f243, {0,%rs45};}
// end inline asm
add.f32 %f691, %f243, %f691;
// begin inline asm
{ mov.b32 %f244, {0,%rs46};}
// end inline asm
mul.f32 %f273, %f243, %f244;
add.f32 %f274, %f269, %f273;
// begin inline asm
{ mov.b32 %f245, {0,%rs47};}
// end inline asm
sub.f32 %f275, %f245, %f703;
mul.f32 %f276, %f704, %f275;
fma.rn.f32 %f699, %f243, %f276, %f699;
fma.rn.f32 %f277, %f273, %f276, %f272;
mov.b32 {%rs48, %rs51}, %r310;
// begin inline asm
{ mov.b32 %f246, {0,%rs48};}
// end inline asm
add.f32 %f690, %f246, %f690;
mov.b32 {%rs49, %rs52}, %r302;
// begin inline asm
{ mov.b32 %f247, {0,%rs49};}
// end inline asm
mul.f32 %f278, %f246, %f247;
add.f32 %f279, %f274, %f278;
mov.b32 {%rs50, %rs53}, %r294;
// begin inline asm
{ mov.b32 %f248, {0,%rs50};}
// end inline asm
sub.f32 %f280, %f248, %f703;
mul.f32 %f281, %f704, %f280;
fma.rn.f32 %f698, %f246, %f281, %f698;
fma.rn.f32 %f282, %f278, %f281, %f277;
// begin inline asm
{ mov.b32 %f249, {0,%rs51};}
// end inline asm
add.f32 %f689, %f249, %f689;
// begin inline asm
{ mov.b32 %f250, {0,%rs52};}
// end inline asm
mul.f32 %f283, %f249, %f250;
add.f32 %f284, %f279, %f283;
// begin inline asm
{ mov.b32 %f251, {0,%rs53};}
// end inline asm
sub.f32 %f285, %f251, %f703;
mul.f32 %f286, %f704, %f285;
fma.rn.f32 %f697, %f249, %f286, %f697;
fma.rn.f32 %f287, %f283, %f286, %f282;
mov.b32 {%rs54, %rs57}, %r311;
// begin inline asm
{ mov.b32 %f252, {0,%rs54};}
// end inline asm
add.f32 %f688, %f252, %f688;
mov.b32 {%rs55, %rs58}, %r303;
// begin inline asm
{ mov.b32 %f253, {0,%rs55};}
// end inline asm
mul.f32 %f288, %f252, %f253;
add.f32 %f289, %f284, %f288;
mov.b32 {%rs56, %rs59}, %r295;
// begin inline asm
{ mov.b32 %f254, {0,%rs56};}
// end inline asm
sub.f32 %f290, %f254, %f703;
mul.f32 %f291, %f704, %f290;
fma.rn.f32 %f696, %f252, %f291, %f696;
fma.rn.f32 %f292, %f288, %f291, %f287;
// begin inline asm
{ mov.b32 %f255, {0,%rs57};}
// end inline asm
add.f32 %f687, %f255, %f687;
// begin inline asm
{ mov.b32 %f256, {0,%rs58};}
// end inline asm
mul.f32 %f293, %f255, %f256;
add.f32 %f722, %f289, %f293;
// begin inline asm
{ mov.b32 %f257, {0,%rs59};}
// end inline asm
sub.f32 %f294, %f257, %f703;
mul.f32 %f295, %f704, %f294;
fma.rn.f32 %f695, %f255, %f295, %f695;
fma.rn.f32 %f721, %f293, %f295, %f292;
bra.uni $L__BB0_16;
$L__BB0_14:
mov.f32 %f721, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs8, %f721;}
// end inline asm
mov.f32 %f722, %f721;
$L__BB0_16:
mov.b32 %r316, %f722;
mov.u32 %r317, 31;
mov.u32 %r318, 16;
mov.u32 %r319, -1;
shfl.sync.bfly.b32 %r320|%p18, %r316, %r318, %r317, %r319;
mov.b32 %f296, %r320;
add.f32 %f297, %f722, %f296;
mov.b32 %r321, %f297;
mov.u32 %r322, 8;
shfl.sync.bfly.b32 %r323|%p19, %r321, %r322, %r317, %r319;
mov.b32 %f298, %r323;
add.f32 %f299, %f297, %f298;
mov.b32 %r324, %f299;
mov.u32 %r325, 4;
shfl.sync.bfly.b32 %r326|%p20, %r324, %r325, %r317, %r319;
mov.b32 %f300, %r326;
add.f32 %f301, %f299, %f300;
mov.b32 %r327, %f301;
mov.u32 %r328, 2;
shfl.sync.bfly.b32 %r329|%p21, %r327, %r328, %r317, %r319;
mov.b32 %f302, %r329;
add.f32 %f303, %f301, %f302;
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
shl.b32 %r729, %r729, 2;
bar.sync 0;
setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
$L__BB0_21:
mov.b32 %r333, %f723;
mov.u32 %r334, 31;
mov.u32 %r335, 16;
mov.u32 %r336, -1;
shfl.sync.bfly.b32 %r337|%p26, %r333, %r335, %r334, %r336;
mov.b32 %f306, %r337;
add.f32 %f307, %f723, %f306;
mov.b32 %r338, %f307;
mov.u32 %r339, 8;
shfl.sync.bfly.b32 %r340|%p27, %r338, %r339, %r334, %r336;
mov.b32 %f308, %r340;
add.f32 %f309, %f307, %f308;
mov.b32 %r341, %f309;
mov.u32 %r342, 4;
shfl.sync.bfly.b32 %r343|%p28, %r341, %r342, %r334, %r336;
mov.b32 %f310, %r343;
add.f32 %f311, %f309, %f310;
mov.b32 %r344, %f311;
mov.u32 %r345, 2;
shfl.sync.bfly.b32 %r346|%p29, %r344, %r345, %r334, %r336;
mov.b32 %f312, %r346;
add.f32 %f313, %f311, %f312;
mov.b32 %r347, %f313;
mov.u32 %r348, 1;
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
shfl.sync.bfly.b32 %r354|%p31, %r350, %r352, %r351, %r353;
mov.b32 %f315, %r354;
add.f32 %f316, %f721, %f315;
mov.b32 %r355, %f316;
mov.u32 %r356, 8;
shfl.sync.bfly.b32 %r357|%p32, %r355, %r356, %r351, %r353;
mov.b32 %f317, %r357;
add.f32 %f318, %f316, %f317;
mov.b32 %r358, %f318;
mov.u32 %r359, 4;
shfl.sync.bfly.b32 %r360|%p33, %r358, %r359, %r351, %r353;
mov.b32 %f319, %r360;
add.f32 %f320, %f318, %f319;
mov.b32 %r361, %f320;
mov.u32 %r362, 2;
shfl.sync.bfly.b32 %r363|%p34, %r361, %r362, %r351, %r353;
mov.b32 %f321, %r363;
add.f32 %f322, %f320, %f321;
mov.b32 %r364, %f322;
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
$L__BB0_27:
mov.b32 %r367, %f725;
mov.u32 %r368, 31;
mov.u32 %r369, 16;
mov.u32 %r370, -1;
shfl.sync.bfly.b32 %r371|%p40, %r367, %r369, %r368, %r370;
mov.b32 %f326, %r371;
add.f32 %f327, %f725, %f326;
mov.b32 %r372, %f327;
mov.u32 %r373, 8;
shfl.sync.bfly.b32 %r374|%p41, %r372, %r373, %r368, %r370;
mov.b32 %f328, %r374;
add.f32 %f329, %f327, %f328;
mov.b32 %r375, %f329;
mov.u32 %r376, 4;
shfl.sync.bfly.b32 %r377|%p42, %r375, %r376, %r368, %r370;
mov.b32 %f330, %r377;
add.f32 %f331, %f329, %f330;
mov.b32 %r378, %f331;
mov.u32 %r379, 2;
shfl.sync.bfly.b32 %r380|%p43, %r378, %r379, %r368, %r370;
mov.b32 %f332, %r380;
add.f32 %f333, %f331, %f332;
mov.b32 %r381, %f333;
mov.u32 %r382, 1;
shfl.sync.bfly.b32 %r383|%p44, %r381, %r382, %r368, %r370;
mov.b32 %f334, %r383;
add.f32 %f726, %f333, %f334;
$L__BB0_28:
bar.sync 0;
@%p6 bra $L__BB0_30;
st.shared.f32 [%rd12], %f66;
$L__BB0_30:
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
bar.sync 0;
ld.shared.f32 %f72, [%rd12];
bar.sync 0;
mov.f32 %f337, 0f00000000;
// begin inline asm
{ cvt.rn.bf16.f32 %rs60, %f337;}
// end inline asm
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
// begin inline asm
{ mov.b32 %f374, {0,%rs97};}
// end inline asm
mov.b32 {%rs98, %rs102}, %r408;
// begin inline asm
{ mov.b32 %f375, {0,%rs98};}
// end inline asm
mul.f32 %f406, %f374, %f375;
mul.f32 %f407, %f406, %f2;
mov.b32 {%rs99, %rs103}, %r392;
// begin inline asm
{ mov.b32 %f376, {0,%rs99};}
// end inline asm
sub.f32 %f408, %f376, %f703;
mul.f32 %f409, %f704, %f408;
sub.f32 %f410, %f407, %f71;
mul.f32 %f411, %f72, %f409;
sub.f32 %f412, %f410, %f411;
mul.f32 %f377, %f686, %f412;
// begin inline asm
{ mov.b32 %f378, {0,%rs101};}
// end inline asm
// begin inline asm
{ mov.b32 %f379, {0,%rs102};}
// end inline asm
mul.f32 %f413, %f378, %f379;
mul.f32 %f414, %f413, %f2;
// begin inline asm
{ mov.b32 %f380, {0,%rs103};}
// end inline asm
sub.f32 %f415, %f380, %f703;
mul.f32 %f416, %f704, %f415;
sub.f32 %f417, %f414, %f71;
mul.f32 %f418, %f72, %f416;
sub.f32 %f419, %f417, %f418;
mul.f32 %f381, %f686, %f419;
// begin inline asm
{ cvt.rn.bf16.f32 %rs104, %f381;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs100, %f377;}
// end inline asm
mov.b32 %r388, {%rs100, %rs104};
mov.b32 {%rs105, %rs109}, %r401;
// begin inline asm
{ mov.b32 %f382, {0,%rs105};}
// end inline asm
mov.b32 {%rs106, %rs110}, %r409;
// begin inline asm
{ mov.b32 %f383, {0,%rs106};}
// end inline asm
mul.f32 %f420, %f382, %f383;
mul.f32 %f421, %f420, %f2;
mov.b32 {%rs107, %rs111}, %r393;
// begin inline asm
{ mov.b32 %f384, {0,%rs107};}
// end inline asm
sub.f32 %f422, %f384, %f703;
mul.f32 %f423, %f704, %f422;
sub.f32 %f424, %f421, %f71;
mul.f32 %f425, %f72, %f423;
sub.f32 %f426, %f424, %f425;
mul.f32 %f385, %f686, %f426;
// begin inline asm
{ mov.b32 %f386, {0,%rs109};}
// end inline asm
// begin inline asm
{ mov.b32 %f387, {0,%rs110};}
// end inline asm
mul.f32 %f427, %f386, %f387;
mul.f32 %f428, %f427, %f2;
// begin inline asm
{ mov.b32 %f388, {0,%rs111};}
// end inline asm
sub.f32 %f429, %f388, %f703;
mul.f32 %f430, %f704, %f429;
sub.f32 %f431, %f428, %f71;
mul.f32 %f432, %f72, %f430;
sub.f32 %f433, %f431, %f432;
mul.f32 %f389, %f686, %f433;
// begin inline asm
{ cvt.rn.bf16.f32 %rs112, %f389;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs108, %f385;}
// end inline asm
mov.b32 %r389, {%rs108, %rs112};
mov.b32 {%rs113, %rs117}, %r402;
// begin inline asm
{ mov.b32 %f390, {0,%rs113};}
// end inline asm
mov.b32 {%rs114, %rs118}, %r410;
// begin inline asm
{ mov.b32 %f391, {0,%rs114};}
// end inline asm
mul.f32 %f434, %f390, %f391;
mul.f32 %f435, %f434, %f2;
mov.b32 {%rs115, %rs119}, %r394;
// begin inline asm
{ mov.b32 %f392, {0,%rs115};}
// end inline asm
sub.f32 %f436, %f392, %f703;
mul.f32 %f437, %f704, %f436;
sub.f32 %f438, %f435, %f71;
mul.f32 %f439, %f72, %f437;
sub.f32 %f440, %f438, %f439;
mul.f32 %f393, %f686, %f440;
// begin inline asm
{ mov.b32 %f394, {0,%rs117};}
// end inline asm
// begin inline asm
{ mov.b32 %f395, {0,%rs118};}
// end inline asm
mul.f32 %f441, %f394, %f395;
mul.f32 %f442, %f441, %f2;
// begin inline asm
{ mov.b32 %f396, {0,%rs119};}
// end inline asm
sub.f32 %f443, %f396, %f703;
mul.f32 %f444, %f704, %f443;
sub.f32 %f445, %f442, %f71;
mul.f32 %f446, %f72, %f444;
sub.f32 %f447, %f445, %f446;
mul.f32 %f397, %f686, %f447;
// begin inline asm
{ cvt.rn.bf16.f32 %rs120, %f397;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs116, %f393;}
// end inline asm
mov.b32 %r390, {%rs116, %rs120};
mov.b32 {%rs121, %rs125}, %r403;
// begin inline asm
{ mov.b32 %f398, {0,%rs121};}
// end inline asm
mov.b32 {%rs122, %rs126}, %r411;
// begin inline asm
{ mov.b32 %f399, {0,%rs122};}
// end inline asm
mul.f32 %f448, %f398, %f399;
mul.f32 %f449, %f448, %f2;
mov.b32 {%rs123, %rs127}, %r395;
// begin inline asm
{ mov.b32 %f400, {0,%rs123};}
// end inline asm
sub.f32 %f450, %f400, %f703;
mul.f32 %f451, %f704, %f450;
sub.f32 %f452, %f449, %f71;
mul.f32 %f453, %f72, %f451;
sub.f32 %f454, %f452, %f453;
mul.f32 %f401, %f686, %f454;
// begin inline asm
{ mov.b32 %f402, {0,%rs125};}
// end inline asm
// begin inline asm
{ mov.b32 %f403, {0,%rs126};}
// end inline asm
mul.f32 %f455, %f402, %f403;
mul.f32 %f456, %f455, %f2;
// begin inline asm
{ mov.b32 %f404, {0,%rs127};}
// end inline asm
sub.f32 %f457, %f404, %f703;
mul.f32 %f458, %f704, %f457;
sub.f32 %f459, %f456, %f71;
mul.f32 %f460, %f72, %f458;
sub.f32 %f461, %f459, %f460;
mul.f32 %f405, %f686, %f461;
// begin inline asm
{ cvt.rn.bf16.f32 %rs128, %f405;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs124, %f401;}
// end inline asm
mov.b32 %r391, {%rs124, %rs128};
mad.lo.s32 %r416, %r23, %r202, %r8;
mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
// begin inline asm
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
// end inline asm
bra.uni $L__BB0_35;
$L__BB0_33:
// begin inline asm
{ cvt.rn.bf16.f32 %rs61, %f337;}
// end inline asm
$L__BB0_35:
add.s32 %r728, %r728, 1;
setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
mov.f32 %f688, %f687;
mov.f32 %f689, %f687;
mov.f32 %f690, %f687;
mov.f32 %f691, %f687;
mov.f32 %f692, %f687;
mov.f32 %f693, %f687;
mov.f32 %f694, %f687;
mov.f32 %f695, %f687;
mov.f32 %f696, %f687;
mov.f32 %f697, %f687;
mov.f32 %f698, %f687;
mov.f32 %f699, %f687;
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
mov.u32 %r417, %tid.z;
mad.lo.s32 %r45, %r417, %r4, %r9;
mad.lo.s32 %r46, %r45, %r3, %r5;
mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
clz.b32 %r418, %r4;
mov.u32 %r419, 31;
sub.s32 %r47, %r419, %r418;
mov.u32 %r420, 1;
shl.b32 %r760, %r420, %r47;
setp.lt.u32 %p50, %r9, %r760;
add.s32 %r421, %r760, %r9;
setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
shl.b32 %r422, %r3, %r47;
add.s32 %r423, %r46, %r422;
mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
mov.u32 %r730, %r760;
$L__BB0_40:
shr.u32 %r50, %r730, 1;
setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
mad.lo.s32 %r424, %r50, %r3, %r46;
mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
setp.gt.u32 %p55, %r730, 7;
mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
mov.u32 %r731, 0;
add.s32 %r426, %r46, %r3;
mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
setp.lt.u32 %p57, %r4, 2;
@%p57 bra $L__BB0_46;
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
shl.b32 %r427, %r3, %r47;
add.s32 %r428, %r46, %r427;
mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
mov.u32 %r732, %r760;
$L__BB0_51:
shr.u32 %r54, %r732, 1;
setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
mad.lo.s32 %r429, %r54, %r3, %r46;
mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
setp.gt.u32 %p61, %r732, 7;
mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@%p63 bra $L__BB0_57;
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
shl.b32 %r431, %r3, %r47;
add.s32 %r432, %r46, %r431;
mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
mov.u32 %r734, %r760;
$L__BB0_62:
shr.u32 %r58, %r734, 1;
setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
mad.lo.s32 %r433, %r58, %r3, %r46;
mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
setp.gt.u32 %p67, %r734, 7;
mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@%p69 bra $L__BB0_68;
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
shl.b32 %r435, %r3, %r47;
add.s32 %r436, %r46, %r435;
mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
mov.u32 %r736, %r760;
$L__BB0_73:
shr.u32 %r62, %r736, 1;
setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
mad.lo.s32 %r437, %r62, %r3, %r46;
mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
setp.gt.u32 %p73, %r736, 7;
mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@%p75 bra $L__BB0_79;
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
shl.b32 %r439, %r3, %r47;
add.s32 %r440, %r46, %r439;
mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
mov.u32 %r738, %r760;
$L__BB0_84:
shr.u32 %r66, %r738, 1;
setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
mad.lo.s32 %r441, %r66, %r3, %r46;
mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
setp.gt.u32 %p79, %r738, 7;
mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@%p81 bra $L__BB0_90;
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
shl.b32 %r443, %r3, %r47;
add.s32 %r444, %r46, %r443;
mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
mov.u32 %r740, %r760;
$L__BB0_95:
shr.u32 %r70, %r740, 1;
setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
mad.lo.s32 %r445, %r70, %r3, %r46;
mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
setp.gt.u32 %p85, %r740, 7;
mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@%p87 bra $L__BB0_101;
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
shl.b32 %r447, %r3, %r47;
add.s32 %r448, %r46, %r447;
mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
mov.u32 %r742, %r760;
$L__BB0_106:
shr.u32 %r74, %r742, 1;
setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
mad.lo.s32 %r449, %r74, %r3, %r46;
mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
setp.gt.u32 %p91, %r742, 7;
mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@%p93 bra $L__BB0_112;
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
shl.b32 %r451, %r3, %r47;
add.s32 %r452, %r46, %r451;
mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
mov.u32 %r744, %r760;
$L__BB0_117:
shr.u32 %r78, %r744, 1;
setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
mad.lo.s32 %r453, %r78, %r3, %r46;
mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
setp.gt.u32 %p97, %r744, 7;
mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@%p99 bra $L__BB0_123;
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
shl.b32 %r455, %r3, %r47;
add.s32 %r456, %r46, %r455;
mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
mov.u32 %r746, %r760;
$L__BB0_128:
shr.u32 %r83, %r746, 1;
setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
mad.lo.s32 %r457, %r83, %r3, %r46;
mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
setp.gt.u32 %p103, %r746, 7;
mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@%p105 bra $L__BB0_134;
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
shl.b32 %r459, %r3, %r47;
add.s32 %r460, %r46, %r459;
mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
mov.u32 %r748, %r760;
$L__BB0_139:
shr.u32 %r87, %r748, 1;
setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
mad.lo.s32 %r461, %r87, %r3, %r46;
mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
setp.gt.u32 %p109, %r748, 7;
mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@%p111 bra $L__BB0_145;
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
shl.b32 %r463, %r3, %r47;
add.s32 %r464, %r46, %r463;
mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
mov.u32 %r750, %r760;
$L__BB0_150:
shr.u32 %r91, %r750, 1;
setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
mad.lo.s32 %r465, %r91, %r3, %r46;
mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
setp.gt.u32 %p115, %r750, 7;
mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@%p117 bra $L__BB0_156;
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
shl.b32 %r467, %r3, %r47;
add.s32 %r468, %r46, %r467;
mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
mov.u32 %r752, %r760;
$L__BB0_161:
shr.u32 %r95, %r752, 1;
setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
mad.lo.s32 %r469, %r95, %r3, %r46;
mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
setp.gt.u32 %p121, %r752, 7;
mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@%p123 bra $L__BB0_167;
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
shl.b32 %r471, %r3, %r47;
add.s32 %r472, %r46, %r471;
mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
mov.u32 %r754, %r760;
$L__BB0_172:
shr.u32 %r99, %r754, 1;
setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
mad.lo.s32 %r473, %r99, %r3, %r46;
mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
setp.gt.u32 %p127, %r754, 7;
mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@%p129 bra $L__BB0_178;
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
shl.b32 %r475, %r3, %r47;
add.s32 %r476, %r46, %r475;
mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
mov.u32 %r756, %r760;
$L__BB0_183:
shr.u32 %r103, %r756, 1;
setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
mad.lo.s32 %r477, %r103, %r3, %r46;
mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
setp.gt.u32 %p133, %r756, 7;
mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@%p135 bra $L__BB0_189;
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
shl.b32 %r479, %r3, %r47;
add.s32 %r480, %r46, %r479;
mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
mov.u32 %r758, %r760;
$L__BB0_194:
shr.u32 %r107, %r758, 1;
setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
mad.lo.s32 %r481, %r107, %r3, %r46;
mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
setp.gt.u32 %p139, %r758, 7;
mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@%p141 bra $L__BB0_200;
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
shl.b32 %r483, %r3, %r47;
add.s32 %r484, %r46, %r483;
mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
shr.u32 %r111, %r760, 1;
setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
mad.lo.s32 %r485, %r111, %r3, %r46;
mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
setp.gt.u32 %p145, %r760, 7;
mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@%p147 bra $L__BB0_210;
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
mov.u32 %r511, %ctaid.y;
mad.lo.s32 %r512, %r202, %r511, %r8;
add.s32 %r513, %r512, %r81;
mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
// begin inline asm
st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
// end inline asm
add.s32 %r514, %r513, 4;
mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
// begin inline asm
st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
// end inline asm
bra.uni $L__BB0_218;
$L__BB0_212:
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
add.s32 %r487, %r8, 3;
sub.s32 %r114, %r487, %r202;
mov.u32 %r488, %ctaid.y;
mad.lo.s32 %r115, %r202, %r488, %r8;
neg.s32 %r489, %r81;
setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
add.s32 %r494, %r115, %r81;
mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
// begin inline asm
st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
// end inline asm
$L__BB0_214:
mov.u32 %r495, -4;
sub.s32 %r496, %r495, %r81;
setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
add.s32 %r501, %r115, %r81;
add.s32 %r502, %r501, 4;
mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
// begin inline asm
st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
// end inline asm
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
shl.b32 %r539, %r729, 5;
mov.u32 %r540, %ctaid.y;
mad.lo.s32 %r541, %r202, %r540, %r8;
add.s32 %r542, %r541, %r539;
mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
// begin inline asm
st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
// end inline asm
add.s32 %r543, %r542, 4;
mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
// begin inline asm
st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
// end inline asm
bra.uni $L__BB0_227;
$L__BB0_219:
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
add.s32 %r515, %r8, 3;
sub.s32 %r116, %r515, %r202;
mov.u32 %r516, %ctaid.y;
mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
shl.b32 %r118, %r729, 5;
neg.s32 %r517, %r118;
setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
add.s32 %r522, %r117, %r118;
mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
// begin inline asm
st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
// end inline asm
$L__BB0_222:
@%p159 bra $L__BB0_227;
shl.b32 %r119, %r729, 5;
mov.u32 %r523, -4;
sub.s32 %r524, %r523, %r119;
setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
add.s32 %r529, %r117, %r119;
add.s32 %r530, %r529, 4;
mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
// begin inline asm
st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
// end inline asm
$L__BB0_227:
mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r544, %r5, %r9;
or.b32 %r546, %r544, %r417;
setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
mov.u32 %r547, %ctaid.x;
mov.u32 %r548, %ctaid.z;
mov.u32 %r549, %nctaid.x;
mad.lo.s32 %r550, %r548, %r549, %r547;
mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
add.s32 %r551, %r11, -1;
setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
mov.u32 %r762, 8;
$L__BB0_230:
// begin inline asm
nanosleep.u32 %r762;
// end inline asm
setp.lt.u32 %p167, %r762, 256;
selp.u32 %r554, 1, 0, %p167;
shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_45_cu_92df908f_723310nvfuser_45ENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
add.s32 %r556, %r202, 1;
shr.u32 %r557, %r556, 31;
add.s32 %r558, %r556, %r557;
shr.s32 %r559, %r558, 1;
add.s32 %r560, %r4, %r559;
add.s32 %r561, %r560, -1;
div.s32 %r562, %r561, %r4;
add.s32 %r563, %r11, -1;
add.s32 %r564, %r563, %r562;
div.s32 %r123, %r564, %r11;
add.s32 %r124, %r563, %r3;
shl.b32 %r125, %r9, 1;
shl.b32 %r565, %r4, 1;
mad.lo.s32 %r128, %r565, %r120, %r125;
or.b32 %r126, %r128, 1;
mul.lo.s32 %r127, %r565, %r11;
shr.u32 %r129, %r3, 5;
mul.lo.s32 %r566, %r45, %r129;
shr.u32 %r130, %r5, 5;
add.s32 %r567, %r566, %r130;
mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
and.b32 %r131, %r5, 31;
add.s32 %r568, %r566, %r131;
mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
div.s32 %r153, %r124, %r3;
setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
mul.lo.s32 %r649, %r127, %r763;
add.s32 %r154, %r126, %r649;
add.s32 %r155, %r128, %r649;
mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
setp.ge.s32 %p207, %r154, %r202;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
mad.lo.s32 %r157, %r770, %r3, %r5;
setp.ge.s32 %p208, %r157, %r11;
mov.u32 %r771, %r648;
mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
mad.lo.s32 %r656, %r157, %r202, %r155;
mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
// begin inline asm
ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
// end inline asm
$L__BB0_263:
mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
add.s32 %r770, %r770, 1;
setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
mov.b32 %r657, %f770;
mov.u32 %r658, 31;
mov.u32 %r659, 16;
mov.u32 %r660, -1;
shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
mov.b32 %r662, %f645;
mov.u32 %r663, 8;
shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
mov.b32 %r665, %f647;
mov.u32 %r666, 4;
shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
mov.b32 %r668, %f649;
mov.u32 %r669, 2;
shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
mov.b32 %r671, %f651;
mov.u32 %r672, 1;
shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
mov.b32 %r674, %f771;
mov.u32 %r675, 31;
mov.u32 %r676, 16;
mov.u32 %r677, -1;
shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
mov.b32 %r679, %f655;
mov.u32 %r680, 8;
shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
mov.b32 %r682, %f657;
mov.u32 %r683, 4;
shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
mov.b32 %r685, %f659;
mov.u32 %r686, 2;
shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
mov.b32 %r688, %f661;
mov.u32 %r689, 1;
shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs131, %f663;}
// end inline asm
mov.b32 %r691, %f769;
mov.u32 %r692, 31;
mov.u32 %r693, 16;
mov.u32 %r694, -1;
shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
mov.b32 %r696, %f666;
mov.u32 %r697, 8;
shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
mov.b32 %r699, %f668;
mov.u32 %r700, 4;
shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
mov.b32 %r702, %f670;
mov.u32 %r703, 2;
shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
mov.b32 %r705, %f672;
mov.u32 %r706, 1;
shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
mov.b32 %r708, %f773;
mov.u32 %r709, 31;
mov.u32 %r710, 16;
mov.u32 %r711, -1;
shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
mov.b32 %r713, %f676;
mov.u32 %r714, 8;
shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
mov.b32 %r716, %f678;
mov.u32 %r717, 4;
shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
mov.b32 %r719, %f680;
mov.u32 %r720, 2;
shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
mov.b32 %r722, %f682;
mov.u32 %r723, 1;
shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs132, %f684;}
// end inline asm
@%p6 bra $L__BB0_279;
mul.lo.s32 %r163, %r127, %r763;
add.s32 %r725, %r126, %r163;
setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
add.s32 %r726, %r128, %r163;
mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
div.s32 %r133, %r124, %r3;
mad.lo.s32 %r134, %r202, %r5, %r125;
shl.b32 %r135, %r120, 1;
shl.b32 %r136, %r11, 1;
mul.lo.s32 %r137, %r202, %r3;
mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
mad.lo.s32 %r139, %r127, %r764, %r126;
mad.lo.s32 %r571, %r136, %r764, %r135;
mad.lo.s32 %r766, %r4, %r571, %r134;
mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
mov.u32 %r765, %r5;
mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
setp.ge.s32 %p172, %r139, %r202;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
setp.ge.s32 %p173, %r765, %r11;
mov.u32 %r768, %r570;
mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
// begin inline asm
ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
// end inline asm
$L__BB0_240:
mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
add.s32 %r766, %r766, %r137;
add.s32 %r765, %r765, %r3;
add.s32 %r767, %r767, 1;
setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
mov.b32 %r578, %f762;
mov.u32 %r579, 31;
mov.u32 %r580, 16;
mov.u32 %r581, -1;
shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
mov.b32 %r583, %f597;
mov.u32 %r584, 8;
shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
mov.b32 %r586, %f599;
mov.u32 %r587, 4;
shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
mov.b32 %r589, %f601;
mov.u32 %r590, 2;
shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
mov.b32 %r592, %f603;
mov.u32 %r593, 1;
shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
mov.b32 %r595, %f763;
mov.u32 %r596, 31;
mov.u32 %r597, 16;
mov.u32 %r598, -1;
shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
mov.b32 %r600, %f607;
mov.u32 %r601, 8;
shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
mov.b32 %r603, %f609;
mov.u32 %r604, 4;
shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
mov.b32 %r606, %f611;
mov.u32 %r607, 2;
shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
mov.b32 %r609, %f613;
mov.u32 %r610, 1;
shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs129, %f615;}
// end inline asm
mov.b32 %r612, %f761;
mov.u32 %r613, 31;
mov.u32 %r614, 16;
mov.u32 %r615, -1;
shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
mov.b32 %r617, %f618;
mov.u32 %r618, 8;
shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
mov.b32 %r620, %f620;
mov.u32 %r621, 4;
shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
mov.b32 %r623, %f622;
mov.u32 %r624, 2;
shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
mov.b32 %r626, %f624;
mov.u32 %r627, 1;
shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
mov.b32 %r629, %f765;
mov.u32 %r630, 31;
mov.u32 %r631, 16;
mov.u32 %r632, -1;
shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
mov.b32 %r634, %f628;
mov.u32 %r635, 8;
shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
mov.b32 %r637, %f630;
mov.u32 %r638, 4;
shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
mov.b32 %r640, %f632;
mov.u32 %r641, 2;
shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
mov.b32 %r643, %f634;
mov.u32 %r644, 1;
shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
bar.sync 0;
// begin inline asm
{ cvt.rn.bf16.f32 %rs130, %f636;}
// end inline asm
@%p6 bra $L__BB0_256;
mul.lo.s32 %r151, %r127, %r764;
add.s32 %r646, %r126, %r151;
setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
add.s32 %r647, %r128, %r151;
mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
add.s32 %r764, %r764, 1;
setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -32,166 +32,166 @@
)
{
.reg .pred %p<243>;
.reg .b16 %rs<133>;
.reg .f32 %f<775>;
- .reg .b32 %r<779>;
+ .reg .b32 %r<773>;
.reg .f64 %fd<3>;
.reg .b64 %rd<217>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- ld.param.v2.u32 {%r202, %r203}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
- ld.param.v2.u32 {%r212, %r213}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
- ld.param.v2.u32 {%r216, %r217}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
+ ld.param.v2.u32 {%r201, %r202}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0+8];
+ ld.param.v2.u32 {%r211, %r212}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2+16];
+ ld.param.v2.u32 {%r215, %r216}, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd41, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd37, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd36, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_0];
- add.s32 %r238, %r203, 7;
- shr.s32 %r239, %r238, 31;
- shr.u32 %r240, %r239, 29;
- add.s32 %r241, %r238, %r240;
- shr.s32 %r2, %r241, 3;
+ add.s32 %r237, %r202, 7;
+ shr.s32 %r238, %r237, 31;
+ shr.u32 %r239, %r238, 29;
+ add.s32 %r240, %r237, %r239;
+ shr.s32 %r2, %r240, 3;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mov.u32 %r5, %tid.x;
setp.ne.s32 %p6, %r5, 0;
@%p6 bra $L__BB0_2;
- mov.u32 %r242, 0;
- st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r242;
+ mov.u32 %r241, 0;
+ st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s], %r241;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd43, _ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r243, [%rd43], %r5;
+ atom.shared.min.s32 %r242, [%rd43], %r5;
ld.shared.u32 %r6, [_ZZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEEE14nvfuser_zero_s];
- mul.lo.s32 %r244, %r4, %r2;
- shl.b32 %r245, %r244, 4;
- or.b32 %r246, %r245, 15;
- and.b32 %r7, %r246, -16;
- add.s32 %r247, %r246, %r7;
- and.b32 %r248, %r247, -16;
- cvt.s64.s32 %rd1, %r248;
- max.s32 %r249, %r2, %r3;
- add.s32 %r250, %r249, 31;
- shr.s32 %r251, %r250, 31;
- shr.u32 %r252, %r251, 27;
- add.s32 %r253, %r250, %r252;
- shr.u32 %r254, %r253, 5;
- mul.lo.s32 %r255, %r4, %r254;
- shl.b32 %r256, %r255, 7;
- cvt.u64.u32 %rd2, %r256;
+ mul.lo.s32 %r243, %r4, %r2;
+ shl.b32 %r244, %r243, 4;
+ or.b32 %r245, %r244, 15;
+ and.b32 %r7, %r245, -16;
+ add.s32 %r246, %r245, %r7;
+ and.b32 %r247, %r246, -16;
+ cvt.s64.s32 %rd1, %r247;
+ max.s32 %r248, %r2, %r3;
+ add.s32 %r249, %r248, 31;
+ shr.s32 %r250, %r249, 31;
+ shr.u32 %r251, %r250, 27;
+ add.s32 %r252, %r249, %r251;
+ shr.u32 %r253, %r252, 5;
+ mul.lo.s32 %r254, %r4, %r253;
+ shl.b32 %r255, %r254, 7;
+ cvt.u64.u32 %rd2, %r255;
mov.u64 %rd44, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd45, %rd44;
add.s64 %rd3, %rd45, %rd2;
shl.b32 %r8, %r5, 3;
- or.b32 %r257, %r8, 7;
- setp.lt.s32 %p7, %r257, %r203;
+ or.b32 %r256, %r8, 7;
+ setp.lt.s32 %p7, %r256, %r202;
setp.lt.s32 %p8, %r5, %r2;
and.pred %p1, %p7, %p8;
not.pred %p9, %p1;
mov.u32 %r9, %tid.y;
setp.ne.s32 %p10, %r9, 0;
or.pred %p11, %p10, %p9;
@%p11 bra $L__BB0_4;
add.s64 %rd46, %rd3, %rd1;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r258, smem_ptr; }
-
-
- shl.b32 %r261, %r5, 4;
- add.s32 %r259, %r258, %r261;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd46; cvt.u32.u64 %r257, smem_ptr; }
+
+
+ shl.b32 %r260, %r5, 4;
+ add.s32 %r258, %r257, %r260;
mul.wide.s32 %rd48, %r8, 2;
add.s64 %rd47, %rd36, %rd48;
- mov.u32 %r260, 0;
+ mov.u32 %r259, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r260, 0;
- cp.async.ca.shared.global [%r259], [%rd47], 16, p0;
+ setp.ne.b32 p0, %r259, 0;
+ cp.async.ca.shared.global [%r258], [%rd47], 16, p0;
}
$L__BB0_4:
bar.sync 0;
- shl.b32 %r735, %r6, 4;
- add.s32 %r262, %r4, 215;
- div.s32 %r263, %r262, %r4;
+ shl.b32 %r729, %r6, 4;
+ add.s32 %r261, %r4, 215;
+ div.s32 %r262, %r261, %r4;
mov.u32 %r11, %nctaid.y;
- add.s32 %r264, %r11, %r263;
- add.s32 %r265, %r264, -1;
- div.s32 %r12, %r265, %r11;
+ add.s32 %r263, %r11, %r262;
+ add.s32 %r264, %r263, -1;
+ div.s32 %r12, %r264, %r11;
setp.gt.s32 %p12, %r12, 0;
add.s64 %rd4, %rd1, %rd2;
@%p12 bra $L__BB0_6;
bra.uni $L__BB0_5;
$L__BB0_6:
- cvt.rn.f64.s32 %fd1, %r203;
+ cvt.rn.f64.s32 %fd1, %r202;
cvt.s64.s32 %rd49, %r7;
add.s64 %rd50, %rd49, %rd2;
add.s64 %rd52, %rd44, %rd2;
- mov.u32 %r267, %ctaid.y;
- mul.lo.s32 %r268, %r12, %r4;
- mul.lo.s32 %r13, %r268, %r267;
- shl.b32 %r269, %r9, 1;
- shl.b32 %r270, %r5, 4;
- mad.lo.s32 %r14, %r269, %r203, %r270;
- mul.lo.s32 %r271, %r203, %r9;
- cvt.s64.s32 %rd53, %r271;
+ mov.u32 %r266, %ctaid.y;
+ mul.lo.s32 %r267, %r12, %r4;
+ mul.lo.s32 %r13, %r267, %r266;
+ mad.lo.s32 %r268, %r2, %r9, %r5;
+ shl.b32 %r14, %r268, 4;
+ mul.lo.s32 %r269, %r202, %r9;
+ cvt.s64.s32 %rd53, %r269;
cvt.s64.s32 %rd54, %r8;
add.s64 %rd5, %rd53, %rd54;
- mul.lo.s32 %r272, %r13, %r203;
- cvt.s64.s32 %rd6, %r272;
+ mul.lo.s32 %r270, %r13, %r202;
+ cvt.s64.s32 %rd6, %r270;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
- add.s32 %r15, %r271, %r8;
+ shl.b32 %r271, %r9, 3;
+ mad.lo.s32 %r272, %r271, %r2, %r8;
add.s64 %rd55, %rd44, %rd50;
- mul.wide.s32 %rd56, %r15, 2;
+ mul.wide.s32 %rd56, %r272, 2;
add.s64 %rd7, %rd55, %rd56;
mov.u32 %r273, %tid.z;
mad.lo.s32 %r274, %r273, %r4, %r9;
- shr.u32 %r16, %r3, 5;
- mul.lo.s32 %r275, %r274, %r16;
- shr.u32 %r17, %r5, 5;
- add.s32 %r276, %r275, %r17;
+ shr.u32 %r15, %r3, 5;
+ mul.lo.s32 %r275, %r274, %r15;
+ shr.u32 %r16, %r5, 5;
+ add.s32 %r276, %r275, %r16;
mul.wide.u32 %rd57, %r276, 4;
add.s64 %rd8, %rd44, %rd57;
add.s64 %rd58, %rd44, %rd4;
mul.wide.s32 %rd59, %r8, 2;
add.s64 %rd9, %rd58, %rd59;
- and.b32 %r18, %r5, 31;
- add.s32 %r277, %r275, %r18;
+ and.b32 %r17, %r5, 31;
+ add.s32 %r277, %r275, %r17;
mul.wide.u32 %rd60, %r277, 4;
add.s64 %rd10, %rd44, %rd60;
add.s64 %rd11, %rd52, %rd56;
mul.wide.s32 %rd61, %r274, 4;
add.s64 %rd12, %rd44, %rd61;
cvt.rn.f32.f64 %f2, %fd1;
add.s64 %rd13, %rd45, %rd50;
cvta.to.global.u64 %rd15, %rd35;
cvta.to.global.u64 %rd16, %rd34;
- mov.u32 %r734, 0;
+ mov.u32 %r728, 0;
mov.f32 %f200, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd13; cvt.u32.u64 %r280, smem_ptr; }
- add.s32 %r281, %r14, %r280;
+ add.s32 %r281, %r280, %r14;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r283, smem_ptr; }
- add.s32 %r284, %r14, %r283;
+ add.s32 %r284, %r283, %r14;
mov.f32 %f687, %f200;
mov.f32 %f688, %f200;
mov.f32 %f689, %f200;
mov.f32 %f690, %f200;
mov.f32 %f691, %f200;
@@ -207,29 +207,29 @@
mov.f32 %f701, %f200;
mov.f32 %f702, %f200;
$L__BB0_7:
.pragma "nounroll";
- mul.lo.s32 %r23, %r734, %r4;
- add.s32 %r278, %r23, %r9;
- add.s32 %r24, %r278, %r13;
- setp.gt.s32 %p13, %r24, 215;
+ mul.lo.s32 %r22, %r728, %r4;
+ add.s32 %r278, %r22, %r9;
+ add.s32 %r23, %r278, %r13;
+ setp.gt.s32 %p13, %r23, 215;
mov.f32 %f703, %f200;
@%p13 bra $L__BB0_9;
- mul.lo.s32 %r279, %r24, %r212;
+ mul.lo.s32 %r279, %r23, %r211;
mul.wide.s32 %rd63, %r279, 4;
add.s64 %rd64, %rd16, %rd63;
ld.global.f32 %f703, [%rd64];
$L__BB0_9:
- setp.lt.s32 %p14, %r24, 216;
+ setp.lt.s32 %p14, %r23, 216;
and.pred %p2, %p1, %p14;
not.pred %p15, %p2;
@%p15 bra $L__BB0_11;
- mul.lo.s32 %r286, %r23, %r203;
+ mul.lo.s32 %r286, %r22, %r202;
cvt.s64.s32 %rd69, %r286;
add.s64 %rd70, %rd5, %rd69;
add.s64 %rd71, %rd70, %rd6;
shl.b64 %rd72, %rd71, 1;
add.s64 %rd66, %rd33, %rd72;
@@ -255,11 +255,11 @@
$L__BB0_11:
mov.f32 %f202, 0f00000000;
mov.f32 %f704, %f202;
@%p13 bra $L__BB0_13;
- mul.lo.s32 %r287, %r24, %r216;
+ mul.lo.s32 %r287, %r23, %r215;
mul.wide.s32 %rd73, %r287, 4;
add.s64 %rd74, %rd15, %rd73;
ld.global.f32 %f704, [%rd74];
$L__BB0_13:
@@ -478,23 +478,23 @@
mov.b32 %r330, %f303;
mov.u32 %r331, 1;
shfl.sync.bfly.b32 %r332|%p22, %r330, %r331, %r317, %r319;
mov.b32 %f304, %r332;
add.f32 %f724, %f303, %f304;
- shl.b32 %r735, %r735, 2;
- bar.sync 0;
- setp.ne.s32 %p23, %r18, 0;
+ shl.b32 %r729, %r729, 2;
+ bar.sync 0;
+ setp.ne.s32 %p23, %r17, 0;
@%p23 bra $L__BB0_18;
st.shared.f32 [%rd8], %f724;
$L__BB0_18:
- setp.ne.s32 %p24, %r17, 0;
+ setp.ne.s32 %p24, %r16, 0;
bar.sync 0;
@%p24 bra $L__BB0_22;
- setp.ge.u32 %p25, %r18, %r16;
+ setp.ge.u32 %p25, %r17, %r15;
mov.f32 %f723, 0f00000000;
@%p25 bra $L__BB0_21;
ld.shared.f32 %f723, [%rd10];
@@ -526,11 +526,11 @@
shfl.sync.bfly.b32 %r349|%p30, %r347, %r348, %r334, %r336;
mov.b32 %f314, %r349;
add.f32 %f724, %f313, %f314;
$L__BB0_22:
- setp.ne.s32 %p242, %r18, 0;
+ setp.ne.s32 %p242, %r17, 0;
bar.sync 0;
mov.b32 %r350, %f721;
mov.u32 %r351, 31;
mov.u32 %r352, 16;
mov.u32 %r353, -1;
@@ -556,23 +556,23 @@
mov.u32 %r365, 1;
shfl.sync.bfly.b32 %r366|%p35, %r364, %r365, %r351, %r353;
mov.b32 %f323, %r366;
add.f32 %f726, %f322, %f323;
add.f32 %f324, %f724, 0f00000000;
- setp.eq.s32 %p37, %r18, 0;
+ setp.eq.s32 %p37, %r17, 0;
selp.f32 %f66, %f324, 0f00000000, %p37;
bar.sync 0;
@%p242 bra $L__BB0_24;
st.shared.f32 [%rd8], %f726;
$L__BB0_24:
- setp.ne.s32 %p240, %r17, 0;
+ setp.ne.s32 %p240, %r16, 0;
bar.sync 0;
@%p240 bra $L__BB0_28;
- setp.ge.u32 %p39, %r18, %r16;
+ setp.ge.u32 %p39, %r17, %r15;
mov.f32 %f725, 0f00000000;
@%p39 bra $L__BB0_27;
ld.shared.f32 %f725, [%rd10];
@@ -615,11 +615,11 @@
bar.sync 0;
ld.shared.f32 %f71, [%rd12];
bar.sync 0;
@%p6 bra $L__BB0_32;
- setp.eq.s32 %p241, %r18, 0;
+ setp.eq.s32 %p241, %r17, 0;
add.f32 %f335, %f726, 0f00000000;
selp.f32 %f336, %f335, 0f00000000, %p241;
st.shared.f32 [%rd12], %f336;
$L__BB0_32:
@@ -633,11 +633,10 @@
@%p2 bra $L__BB0_34;
bra.uni $L__BB0_33;
$L__BB0_34:
- mul.lo.s32 %r732, %r734, %r4;
mul.f32 %f686, %f704, %f1;
ld.shared.v4.u32 {%r392, %r393, %r394, %r395}, [%rd7];
ld.shared.v4.u32 {%r400, %r401, %r402, %r403}, [%rd9];
ld.shared.v4.u32 {%r408, %r409, %r410, %r411}, [%rd11];
mov.b32 {%rs97, %rs101}, %r400;
@@ -846,13 +845,12 @@
{ cvt.rn.bf16.f32 %rs124, %f401;}
mov.b32 %r391, {%rs124, %rs128};
- add.s32 %r416, %r13, %r732;
- mad.lo.s32 %r417, %r416, %r203, %r15;
- mul.wide.s32 %rd76, %r417, 2;
+ mad.lo.s32 %r416, %r23, %r202, %r8;
+ mul.wide.s32 %rd76, %r416, 2;
add.s64 %rd75, %rd37, %rd76;
st.global.cs.v4.s32 [%rd75], {%r388,%r389,%r390,%r391};
bra.uni $L__BB0_35;
@@ -862,12 +860,12 @@
{ cvt.rn.bf16.f32 %rs61, %f337;}
$L__BB0_35:
- add.s32 %r734, %r734, 1;
- setp.lt.s32 %p49, %r734, %r12;
+ add.s32 %r728, %r728, 1;
+ setp.lt.s32 %p49, %r728, %r12;
@%p49 bra $L__BB0_7;
bra.uni $L__BB0_36;
$L__BB0_5:
mov.f32 %f687, 0f00000000;
@@ -886,68 +884,68 @@
mov.f32 %f700, %f687;
mov.f32 %f701, %f687;
mov.f32 %f702, %f687;
$L__BB0_36:
- mov.u32 %r418, %tid.z;
- mad.lo.s32 %r46, %r418, %r4, %r9;
- mad.lo.s32 %r47, %r46, %r3, %r5;
- mul.wide.u32 %rd77, %r47, 4;
+ mov.u32 %r417, %tid.z;
+ mad.lo.s32 %r45, %r417, %r4, %r9;
+ mad.lo.s32 %r46, %r45, %r3, %r5;
+ mul.wide.u32 %rd77, %r46, 4;
add.s64 %rd22, %rd44, %rd77;
st.shared.f32 [%rd22], %f702;
bar.sync 0;
- clz.b32 %r419, %r4;
- mov.u32 %r420, 31;
- sub.s32 %r48, %r420, %r419;
- mov.u32 %r421, 1;
- shl.b32 %r766, %r421, %r48;
- setp.lt.u32 %p50, %r9, %r766;
- add.s32 %r422, %r766, %r9;
- setp.lt.u32 %p51, %r422, %r4;
+ clz.b32 %r418, %r4;
+ mov.u32 %r419, 31;
+ sub.s32 %r47, %r419, %r418;
+ mov.u32 %r420, 1;
+ shl.b32 %r760, %r420, %r47;
+ setp.lt.u32 %p50, %r9, %r760;
+ add.s32 %r421, %r760, %r9;
+ setp.lt.u32 %p51, %r421, %r4;
and.pred %p3, %p50, %p51;
not.pred %p52, %p3;
@%p52 bra $L__BB0_38;
- shl.b32 %r423, %r3, %r48;
- add.s32 %r424, %r47, %r423;
- mul.wide.s32 %rd79, %r424, 4;
+ shl.b32 %r422, %r3, %r47;
+ add.s32 %r423, %r46, %r422;
+ mul.wide.s32 %rd79, %r423, 4;
add.s64 %rd81, %rd44, %rd79;
ld.shared.f32 %f462, [%rd22];
ld.shared.f32 %f463, [%rd81];
add.f32 %f464, %f463, %f462;
st.shared.f32 [%rd22], %f464;
$L__BB0_38:
bar.sync 0;
- setp.lt.s32 %p53, %r766, 4;
+ setp.lt.s32 %p53, %r760, 4;
@%p53 bra $L__BB0_43;
- mov.u32 %r736, %r766;
+ mov.u32 %r730, %r760;
$L__BB0_40:
- shr.u32 %r51, %r736, 1;
- setp.ge.u32 %p54, %r9, %r51;
+ shr.u32 %r50, %r730, 1;
+ setp.ge.u32 %p54, %r9, %r50;
@%p54 bra $L__BB0_42;
- mad.lo.s32 %r425, %r51, %r3, %r47;
- mul.wide.s32 %rd82, %r425, 4;
+ mad.lo.s32 %r424, %r50, %r3, %r46;
+ mul.wide.s32 %rd82, %r424, 4;
add.s64 %rd84, %rd44, %rd82;
ld.shared.f32 %f465, [%rd22];
ld.shared.f32 %f466, [%rd84];
add.f32 %f467, %f466, %f465;
st.shared.f32 [%rd22], %f467;
$L__BB0_42:
bar.sync 0;
- setp.gt.u32 %p55, %r736, 7;
- mov.u32 %r736, %r51;
+ setp.gt.u32 %p55, %r730, 7;
+ mov.u32 %r730, %r50;
@%p55 bra $L__BB0_40;
$L__BB0_43:
- mov.u32 %r737, 0;
- add.s32 %r427, %r47, %r3;
- mul.wide.u32 %rd85, %r427, 4;
+ mov.u32 %r731, 0;
+ add.s32 %r426, %r46, %r3;
+ mul.wide.u32 %rd85, %r426, 4;
add.s64 %rd23, %rd44, %rd85;
@%p10 bra $L__BB0_47;
ld.shared.f32 %f468, [%rd22];
add.f32 %f743, %f468, 0f00000000;
@@ -956,54 +954,54 @@
ld.shared.f32 %f469, [%rd23];
add.f32 %f743, %f743, %f469;
$L__BB0_46:
- mov.b32 %r737, %f743;
+ mov.b32 %r731, %f743;
$L__BB0_47:
bar.sync 0;
st.shared.f32 [%rd22], %f701;
bar.sync 0;
@%p52 bra $L__BB0_49;
- shl.b32 %r428, %r3, %r48;
- add.s32 %r429, %r47, %r428;
- mul.wide.s32 %rd87, %r429, 4;
+ shl.b32 %r427, %r3, %r47;
+ add.s32 %r428, %r46, %r427;
+ mul.wide.s32 %rd87, %r428, 4;
add.s64 %rd89, %rd44, %rd87;
ld.shared.f32 %f470, [%rd22];
ld.shared.f32 %f471, [%rd89];
add.f32 %f472, %f471, %f470;
st.shared.f32 [%rd22], %f472;
$L__BB0_49:
bar.sync 0;
@%p53 bra $L__BB0_54;
- mov.u32 %r738, %r766;
+ mov.u32 %r732, %r760;
$L__BB0_51:
- shr.u32 %r55, %r738, 1;
- setp.ge.u32 %p60, %r9, %r55;
+ shr.u32 %r54, %r732, 1;
+ setp.ge.u32 %p60, %r9, %r54;
@%p60 bra $L__BB0_53;
- mad.lo.s32 %r430, %r55, %r3, %r47;
- mul.wide.s32 %rd90, %r430, 4;
+ mad.lo.s32 %r429, %r54, %r3, %r46;
+ mul.wide.s32 %rd90, %r429, 4;
add.s64 %rd92, %rd44, %rd90;
ld.shared.f32 %f473, [%rd22];
ld.shared.f32 %f474, [%rd92];
add.f32 %f475, %f474, %f473;
st.shared.f32 [%rd22], %f475;
$L__BB0_53:
bar.sync 0;
- setp.gt.u32 %p61, %r738, 7;
- mov.u32 %r738, %r55;
+ setp.gt.u32 %p61, %r732, 7;
+ mov.u32 %r732, %r54;
@%p61 bra $L__BB0_51;
$L__BB0_54:
- mov.u32 %r739, 0;
+ mov.u32 %r733, 0;
@%p10 bra $L__BB0_58;
ld.shared.f32 %f476, [%rd22];
add.f32 %f744, %f476, 0f00000000;
setp.lt.u32 %p63, %r4, 2;
@@ -1011,54 +1009,54 @@
ld.shared.f32 %f477, [%rd23];
add.f32 %f744, %f744, %f477;
$L__BB0_57:
- mov.b32 %r739, %f744;
+ mov.b32 %r733, %f744;
$L__BB0_58:
bar.sync 0;
st.shared.f32 [%rd22], %f700;
bar.sync 0;
@%p52 bra $L__BB0_60;
- shl.b32 %r432, %r3, %r48;
- add.s32 %r433, %r47, %r432;
- mul.wide.s32 %rd93, %r433, 4;
+ shl.b32 %r431, %r3, %r47;
+ add.s32 %r432, %r46, %r431;
+ mul.wide.s32 %rd93, %r432, 4;
add.s64 %rd95, %rd44, %rd93;
ld.shared.f32 %f478, [%rd22];
ld.shared.f32 %f479, [%rd95];
add.f32 %f480, %f479, %f478;
st.shared.f32 [%rd22], %f480;
$L__BB0_60:
bar.sync 0;
@%p53 bra $L__BB0_65;
- mov.u32 %r740, %r766;
+ mov.u32 %r734, %r760;
$L__BB0_62:
- shr.u32 %r59, %r740, 1;
- setp.ge.u32 %p66, %r9, %r59;
+ shr.u32 %r58, %r734, 1;
+ setp.ge.u32 %p66, %r9, %r58;
@%p66 bra $L__BB0_64;
- mad.lo.s32 %r434, %r59, %r3, %r47;
- mul.wide.s32 %rd96, %r434, 4;
+ mad.lo.s32 %r433, %r58, %r3, %r46;
+ mul.wide.s32 %rd96, %r433, 4;
add.s64 %rd98, %rd44, %rd96;
ld.shared.f32 %f481, [%rd22];
ld.shared.f32 %f482, [%rd98];
add.f32 %f483, %f482, %f481;
st.shared.f32 [%rd22], %f483;
$L__BB0_64:
bar.sync 0;
- setp.gt.u32 %p67, %r740, 7;
- mov.u32 %r740, %r59;
+ setp.gt.u32 %p67, %r734, 7;
+ mov.u32 %r734, %r58;
@%p67 bra $L__BB0_62;
$L__BB0_65:
- mov.u32 %r741, 0;
+ mov.u32 %r735, 0;
@%p10 bra $L__BB0_69;
ld.shared.f32 %f484, [%rd22];
add.f32 %f745, %f484, 0f00000000;
setp.lt.u32 %p69, %r4, 2;
@@ -1066,54 +1064,54 @@
ld.shared.f32 %f485, [%rd23];
add.f32 %f745, %f745, %f485;
$L__BB0_68:
- mov.b32 %r741, %f745;
+ mov.b32 %r735, %f745;
$L__BB0_69:
bar.sync 0;
st.shared.f32 [%rd22], %f699;
bar.sync 0;
@%p52 bra $L__BB0_71;
- shl.b32 %r436, %r3, %r48;
- add.s32 %r437, %r47, %r436;
- mul.wide.s32 %rd99, %r437, 4;
+ shl.b32 %r435, %r3, %r47;
+ add.s32 %r436, %r46, %r435;
+ mul.wide.s32 %rd99, %r436, 4;
add.s64 %rd101, %rd44, %rd99;
ld.shared.f32 %f486, [%rd22];
ld.shared.f32 %f487, [%rd101];
add.f32 %f488, %f487, %f486;
st.shared.f32 [%rd22], %f488;
$L__BB0_71:
bar.sync 0;
@%p53 bra $L__BB0_76;
- mov.u32 %r742, %r766;
+ mov.u32 %r736, %r760;
$L__BB0_73:
- shr.u32 %r63, %r742, 1;
- setp.ge.u32 %p72, %r9, %r63;
+ shr.u32 %r62, %r736, 1;
+ setp.ge.u32 %p72, %r9, %r62;
@%p72 bra $L__BB0_75;
- mad.lo.s32 %r438, %r63, %r3, %r47;
- mul.wide.s32 %rd102, %r438, 4;
+ mad.lo.s32 %r437, %r62, %r3, %r46;
+ mul.wide.s32 %rd102, %r437, 4;
add.s64 %rd104, %rd44, %rd102;
ld.shared.f32 %f489, [%rd22];
ld.shared.f32 %f490, [%rd104];
add.f32 %f491, %f490, %f489;
st.shared.f32 [%rd22], %f491;
$L__BB0_75:
bar.sync 0;
- setp.gt.u32 %p73, %r742, 7;
- mov.u32 %r742, %r63;
+ setp.gt.u32 %p73, %r736, 7;
+ mov.u32 %r736, %r62;
@%p73 bra $L__BB0_73;
$L__BB0_76:
- mov.u32 %r743, 0;
+ mov.u32 %r737, 0;
@%p10 bra $L__BB0_80;
ld.shared.f32 %f492, [%rd22];
add.f32 %f746, %f492, 0f00000000;
setp.lt.u32 %p75, %r4, 2;
@@ -1121,54 +1119,54 @@
ld.shared.f32 %f493, [%rd23];
add.f32 %f746, %f746, %f493;
$L__BB0_79:
- mov.b32 %r743, %f746;
+ mov.b32 %r737, %f746;
$L__BB0_80:
bar.sync 0;
st.shared.f32 [%rd22], %f698;
bar.sync 0;
@%p52 bra $L__BB0_82;
- shl.b32 %r440, %r3, %r48;
- add.s32 %r441, %r47, %r440;
- mul.wide.s32 %rd105, %r441, 4;
+ shl.b32 %r439, %r3, %r47;
+ add.s32 %r440, %r46, %r439;
+ mul.wide.s32 %rd105, %r440, 4;
add.s64 %rd107, %rd44, %rd105;
ld.shared.f32 %f494, [%rd22];
ld.shared.f32 %f495, [%rd107];
add.f32 %f496, %f495, %f494;
st.shared.f32 [%rd22], %f496;
$L__BB0_82:
bar.sync 0;
@%p53 bra $L__BB0_87;
- mov.u32 %r744, %r766;
+ mov.u32 %r738, %r760;
$L__BB0_84:
- shr.u32 %r67, %r744, 1;
- setp.ge.u32 %p78, %r9, %r67;
+ shr.u32 %r66, %r738, 1;
+ setp.ge.u32 %p78, %r9, %r66;
@%p78 bra $L__BB0_86;
- mad.lo.s32 %r442, %r67, %r3, %r47;
- mul.wide.s32 %rd108, %r442, 4;
+ mad.lo.s32 %r441, %r66, %r3, %r46;
+ mul.wide.s32 %rd108, %r441, 4;
add.s64 %rd110, %rd44, %rd108;
ld.shared.f32 %f497, [%rd22];
ld.shared.f32 %f498, [%rd110];
add.f32 %f499, %f498, %f497;
st.shared.f32 [%rd22], %f499;
$L__BB0_86:
bar.sync 0;
- setp.gt.u32 %p79, %r744, 7;
- mov.u32 %r744, %r67;
+ setp.gt.u32 %p79, %r738, 7;
+ mov.u32 %r738, %r66;
@%p79 bra $L__BB0_84;
$L__BB0_87:
- mov.u32 %r745, 0;
+ mov.u32 %r739, 0;
@%p10 bra $L__BB0_91;
ld.shared.f32 %f500, [%rd22];
add.f32 %f747, %f500, 0f00000000;
setp.lt.u32 %p81, %r4, 2;
@@ -1176,54 +1174,54 @@
ld.shared.f32 %f501, [%rd23];
add.f32 %f747, %f747, %f501;
$L__BB0_90:
- mov.b32 %r745, %f747;
+ mov.b32 %r739, %f747;
$L__BB0_91:
bar.sync 0;
st.shared.f32 [%rd22], %f697;
bar.sync 0;
@%p52 bra $L__BB0_93;
- shl.b32 %r444, %r3, %r48;
- add.s32 %r445, %r47, %r444;
- mul.wide.s32 %rd111, %r445, 4;
+ shl.b32 %r443, %r3, %r47;
+ add.s32 %r444, %r46, %r443;
+ mul.wide.s32 %rd111, %r444, 4;
add.s64 %rd113, %rd44, %rd111;
ld.shared.f32 %f502, [%rd22];
ld.shared.f32 %f503, [%rd113];
add.f32 %f504, %f503, %f502;
st.shared.f32 [%rd22], %f504;
$L__BB0_93:
bar.sync 0;
@%p53 bra $L__BB0_98;
- mov.u32 %r746, %r766;
+ mov.u32 %r740, %r760;
$L__BB0_95:
- shr.u32 %r71, %r746, 1;
- setp.ge.u32 %p84, %r9, %r71;
+ shr.u32 %r70, %r740, 1;
+ setp.ge.u32 %p84, %r9, %r70;
@%p84 bra $L__BB0_97;
- mad.lo.s32 %r446, %r71, %r3, %r47;
- mul.wide.s32 %rd114, %r446, 4;
+ mad.lo.s32 %r445, %r70, %r3, %r46;
+ mul.wide.s32 %rd114, %r445, 4;
add.s64 %rd116, %rd44, %rd114;
ld.shared.f32 %f505, [%rd22];
ld.shared.f32 %f506, [%rd116];
add.f32 %f507, %f506, %f505;
st.shared.f32 [%rd22], %f507;
$L__BB0_97:
bar.sync 0;
- setp.gt.u32 %p85, %r746, 7;
- mov.u32 %r746, %r71;
+ setp.gt.u32 %p85, %r740, 7;
+ mov.u32 %r740, %r70;
@%p85 bra $L__BB0_95;
$L__BB0_98:
- mov.u32 %r747, 0;
+ mov.u32 %r741, 0;
@%p10 bra $L__BB0_102;
ld.shared.f32 %f508, [%rd22];
add.f32 %f748, %f508, 0f00000000;
setp.lt.u32 %p87, %r4, 2;
@@ -1231,54 +1229,54 @@
ld.shared.f32 %f509, [%rd23];
add.f32 %f748, %f748, %f509;
$L__BB0_101:
- mov.b32 %r747, %f748;
+ mov.b32 %r741, %f748;
$L__BB0_102:
bar.sync 0;
st.shared.f32 [%rd22], %f696;
bar.sync 0;
@%p52 bra $L__BB0_104;
- shl.b32 %r448, %r3, %r48;
- add.s32 %r449, %r47, %r448;
- mul.wide.s32 %rd117, %r449, 4;
+ shl.b32 %r447, %r3, %r47;
+ add.s32 %r448, %r46, %r447;
+ mul.wide.s32 %rd117, %r448, 4;
add.s64 %rd119, %rd44, %rd117;
ld.shared.f32 %f510, [%rd22];
ld.shared.f32 %f511, [%rd119];
add.f32 %f512, %f511, %f510;
st.shared.f32 [%rd22], %f512;
$L__BB0_104:
bar.sync 0;
@%p53 bra $L__BB0_109;
- mov.u32 %r748, %r766;
+ mov.u32 %r742, %r760;
$L__BB0_106:
- shr.u32 %r75, %r748, 1;
- setp.ge.u32 %p90, %r9, %r75;
+ shr.u32 %r74, %r742, 1;
+ setp.ge.u32 %p90, %r9, %r74;
@%p90 bra $L__BB0_108;
- mad.lo.s32 %r450, %r75, %r3, %r47;
- mul.wide.s32 %rd120, %r450, 4;
+ mad.lo.s32 %r449, %r74, %r3, %r46;
+ mul.wide.s32 %rd120, %r449, 4;
add.s64 %rd122, %rd44, %rd120;
ld.shared.f32 %f513, [%rd22];
ld.shared.f32 %f514, [%rd122];
add.f32 %f515, %f514, %f513;
st.shared.f32 [%rd22], %f515;
$L__BB0_108:
bar.sync 0;
- setp.gt.u32 %p91, %r748, 7;
- mov.u32 %r748, %r75;
+ setp.gt.u32 %p91, %r742, 7;
+ mov.u32 %r742, %r74;
@%p91 bra $L__BB0_106;
$L__BB0_109:
- mov.u32 %r749, 0;
+ mov.u32 %r743, 0;
@%p10 bra $L__BB0_113;
ld.shared.f32 %f516, [%rd22];
add.f32 %f749, %f516, 0f00000000;
setp.lt.u32 %p93, %r4, 2;
@@ -1286,54 +1284,54 @@
ld.shared.f32 %f517, [%rd23];
add.f32 %f749, %f749, %f517;
$L__BB0_112:
- mov.b32 %r749, %f749;
+ mov.b32 %r743, %f749;
$L__BB0_113:
bar.sync 0;
st.shared.f32 [%rd22], %f695;
bar.sync 0;
@%p52 bra $L__BB0_115;
- shl.b32 %r452, %r3, %r48;
- add.s32 %r453, %r47, %r452;
- mul.wide.s32 %rd123, %r453, 4;
+ shl.b32 %r451, %r3, %r47;
+ add.s32 %r452, %r46, %r451;
+ mul.wide.s32 %rd123, %r452, 4;
add.s64 %rd125, %rd44, %rd123;
ld.shared.f32 %f518, [%rd22];
ld.shared.f32 %f519, [%rd125];
add.f32 %f520, %f519, %f518;
st.shared.f32 [%rd22], %f520;
$L__BB0_115:
bar.sync 0;
@%p53 bra $L__BB0_120;
- mov.u32 %r750, %r766;
+ mov.u32 %r744, %r760;
$L__BB0_117:
- shr.u32 %r79, %r750, 1;
- setp.ge.u32 %p96, %r9, %r79;
+ shr.u32 %r78, %r744, 1;
+ setp.ge.u32 %p96, %r9, %r78;
@%p96 bra $L__BB0_119;
- mad.lo.s32 %r454, %r79, %r3, %r47;
- mul.wide.s32 %rd126, %r454, 4;
+ mad.lo.s32 %r453, %r78, %r3, %r46;
+ mul.wide.s32 %rd126, %r453, 4;
add.s64 %rd128, %rd44, %rd126;
ld.shared.f32 %f521, [%rd22];
ld.shared.f32 %f522, [%rd128];
add.f32 %f523, %f522, %f521;
st.shared.f32 [%rd22], %f523;
$L__BB0_119:
bar.sync 0;
- setp.gt.u32 %p97, %r750, 7;
- mov.u32 %r750, %r79;
+ setp.gt.u32 %p97, %r744, 7;
+ mov.u32 %r744, %r78;
@%p97 bra $L__BB0_117;
$L__BB0_120:
- mov.u32 %r751, 0;
+ mov.u32 %r745, 0;
@%p10 bra $L__BB0_124;
ld.shared.f32 %f524, [%rd22];
add.f32 %f750, %f524, 0f00000000;
setp.lt.u32 %p99, %r4, 2;
@@ -1341,55 +1339,55 @@
ld.shared.f32 %f525, [%rd23];
add.f32 %f750, %f750, %f525;
$L__BB0_123:
- mov.b32 %r751, %f750;
+ mov.b32 %r745, %f750;
$L__BB0_124:
bar.sync 0;
- shl.b32 %r82, %r735, 4;
+ shl.b32 %r81, %r729, 4;
st.shared.f32 [%rd22], %f694;
bar.sync 0;
@%p52 bra $L__BB0_126;
- shl.b32 %r456, %r3, %r48;
- add.s32 %r457, %r47, %r456;
- mul.wide.s32 %rd129, %r457, 4;
+ shl.b32 %r455, %r3, %r47;
+ add.s32 %r456, %r46, %r455;
+ mul.wide.s32 %rd129, %r456, 4;
add.s64 %rd131, %rd44, %rd129;
ld.shared.f32 %f526, [%rd22];
ld.shared.f32 %f527, [%rd131];
add.f32 %f528, %f527, %f526;
st.shared.f32 [%rd22], %f528;
$L__BB0_126:
bar.sync 0;
@%p53 bra $L__BB0_131;
- mov.u32 %r752, %r766;
+ mov.u32 %r746, %r760;
$L__BB0_128:
- shr.u32 %r84, %r752, 1;
- setp.ge.u32 %p102, %r9, %r84;
+ shr.u32 %r83, %r746, 1;
+ setp.ge.u32 %p102, %r9, %r83;
@%p102 bra $L__BB0_130;
- mad.lo.s32 %r458, %r84, %r3, %r47;
- mul.wide.s32 %rd132, %r458, 4;
+ mad.lo.s32 %r457, %r83, %r3, %r46;
+ mul.wide.s32 %rd132, %r457, 4;
add.s64 %rd134, %rd44, %rd132;
ld.shared.f32 %f529, [%rd22];
ld.shared.f32 %f530, [%rd134];
add.f32 %f531, %f530, %f529;
st.shared.f32 [%rd22], %f531;
$L__BB0_130:
bar.sync 0;
- setp.gt.u32 %p103, %r752, 7;
- mov.u32 %r752, %r84;
+ setp.gt.u32 %p103, %r746, 7;
+ mov.u32 %r746, %r83;
@%p103 bra $L__BB0_128;
$L__BB0_131:
- mov.u32 %r753, 0;
+ mov.u32 %r747, 0;
@%p10 bra $L__BB0_135;
ld.shared.f32 %f532, [%rd22];
add.f32 %f751, %f532, 0f00000000;
setp.lt.u32 %p105, %r4, 2;
@@ -1397,54 +1395,54 @@
ld.shared.f32 %f533, [%rd23];
add.f32 %f751, %f751, %f533;
$L__BB0_134:
- mov.b32 %r753, %f751;
+ mov.b32 %r747, %f751;
$L__BB0_135:
bar.sync 0;
st.shared.f32 [%rd22], %f693;
bar.sync 0;
@%p52 bra $L__BB0_137;
- shl.b32 %r460, %r3, %r48;
- add.s32 %r461, %r47, %r460;
- mul.wide.s32 %rd135, %r461, 4;
+ shl.b32 %r459, %r3, %r47;
+ add.s32 %r460, %r46, %r459;
+ mul.wide.s32 %rd135, %r460, 4;
add.s64 %rd137, %rd44, %rd135;
ld.shared.f32 %f534, [%rd22];
ld.shared.f32 %f535, [%rd137];
add.f32 %f536, %f535, %f534;
st.shared.f32 [%rd22], %f536;
$L__BB0_137:
bar.sync 0;
@%p53 bra $L__BB0_142;
- mov.u32 %r754, %r766;
+ mov.u32 %r748, %r760;
$L__BB0_139:
- shr.u32 %r88, %r754, 1;
- setp.ge.u32 %p108, %r9, %r88;
+ shr.u32 %r87, %r748, 1;
+ setp.ge.u32 %p108, %r9, %r87;
@%p108 bra $L__BB0_141;
- mad.lo.s32 %r462, %r88, %r3, %r47;
- mul.wide.s32 %rd138, %r462, 4;
+ mad.lo.s32 %r461, %r87, %r3, %r46;
+ mul.wide.s32 %rd138, %r461, 4;
add.s64 %rd140, %rd44, %rd138;
ld.shared.f32 %f537, [%rd22];
ld.shared.f32 %f538, [%rd140];
add.f32 %f539, %f538, %f537;
st.shared.f32 [%rd22], %f539;
$L__BB0_141:
bar.sync 0;
- setp.gt.u32 %p109, %r754, 7;
- mov.u32 %r754, %r88;
+ setp.gt.u32 %p109, %r748, 7;
+ mov.u32 %r748, %r87;
@%p109 bra $L__BB0_139;
$L__BB0_142:
- mov.u32 %r755, 0;
+ mov.u32 %r749, 0;
@%p10 bra $L__BB0_146;
ld.shared.f32 %f540, [%rd22];
add.f32 %f752, %f540, 0f00000000;
setp.lt.u32 %p111, %r4, 2;
@@ -1452,54 +1450,54 @@
ld.shared.f32 %f541, [%rd23];
add.f32 %f752, %f752, %f541;
$L__BB0_145:
- mov.b32 %r755, %f752;
+ mov.b32 %r749, %f752;
$L__BB0_146:
bar.sync 0;
st.shared.f32 [%rd22], %f692;
bar.sync 0;
@%p52 bra $L__BB0_148;
- shl.b32 %r464, %r3, %r48;
- add.s32 %r465, %r47, %r464;
- mul.wide.s32 %rd141, %r465, 4;
+ shl.b32 %r463, %r3, %r47;
+ add.s32 %r464, %r46, %r463;
+ mul.wide.s32 %rd141, %r464, 4;
add.s64 %rd143, %rd44, %rd141;
ld.shared.f32 %f542, [%rd22];
ld.shared.f32 %f543, [%rd143];
add.f32 %f544, %f543, %f542;
st.shared.f32 [%rd22], %f544;
$L__BB0_148:
bar.sync 0;
@%p53 bra $L__BB0_153;
- mov.u32 %r756, %r766;
+ mov.u32 %r750, %r760;
$L__BB0_150:
- shr.u32 %r92, %r756, 1;
- setp.ge.u32 %p114, %r9, %r92;
+ shr.u32 %r91, %r750, 1;
+ setp.ge.u32 %p114, %r9, %r91;
@%p114 bra $L__BB0_152;
- mad.lo.s32 %r466, %r92, %r3, %r47;
- mul.wide.s32 %rd144, %r466, 4;
+ mad.lo.s32 %r465, %r91, %r3, %r46;
+ mul.wide.s32 %rd144, %r465, 4;
add.s64 %rd146, %rd44, %rd144;
ld.shared.f32 %f545, [%rd22];
ld.shared.f32 %f546, [%rd146];
add.f32 %f547, %f546, %f545;
st.shared.f32 [%rd22], %f547;
$L__BB0_152:
bar.sync 0;
- setp.gt.u32 %p115, %r756, 7;
- mov.u32 %r756, %r92;
+ setp.gt.u32 %p115, %r750, 7;
+ mov.u32 %r750, %r91;
@%p115 bra $L__BB0_150;
$L__BB0_153:
- mov.u32 %r757, 0;
+ mov.u32 %r751, 0;
@%p10 bra $L__BB0_157;
ld.shared.f32 %f548, [%rd22];
add.f32 %f753, %f548, 0f00000000;
setp.lt.u32 %p117, %r4, 2;
@@ -1507,54 +1505,54 @@
ld.shared.f32 %f549, [%rd23];
add.f32 %f753, %f753, %f549;
$L__BB0_156:
- mov.b32 %r757, %f753;
+ mov.b32 %r751, %f753;
$L__BB0_157:
bar.sync 0;
st.shared.f32 [%rd22], %f691;
bar.sync 0;
@%p52 bra $L__BB0_159;
- shl.b32 %r468, %r3, %r48;
- add.s32 %r469, %r47, %r468;
- mul.wide.s32 %rd147, %r469, 4;
+ shl.b32 %r467, %r3, %r47;
+ add.s32 %r468, %r46, %r467;
+ mul.wide.s32 %rd147, %r468, 4;
add.s64 %rd149, %rd44, %rd147;
ld.shared.f32 %f550, [%rd22];
ld.shared.f32 %f551, [%rd149];
add.f32 %f552, %f551, %f550;
st.shared.f32 [%rd22], %f552;
$L__BB0_159:
bar.sync 0;
@%p53 bra $L__BB0_164;
- mov.u32 %r758, %r766;
+ mov.u32 %r752, %r760;
$L__BB0_161:
- shr.u32 %r96, %r758, 1;
- setp.ge.u32 %p120, %r9, %r96;
+ shr.u32 %r95, %r752, 1;
+ setp.ge.u32 %p120, %r9, %r95;
@%p120 bra $L__BB0_163;
- mad.lo.s32 %r470, %r96, %r3, %r47;
- mul.wide.s32 %rd150, %r470, 4;
+ mad.lo.s32 %r469, %r95, %r3, %r46;
+ mul.wide.s32 %rd150, %r469, 4;
add.s64 %rd152, %rd44, %rd150;
ld.shared.f32 %f553, [%rd22];
ld.shared.f32 %f554, [%rd152];
add.f32 %f555, %f554, %f553;
st.shared.f32 [%rd22], %f555;
$L__BB0_163:
bar.sync 0;
- setp.gt.u32 %p121, %r758, 7;
- mov.u32 %r758, %r96;
+ setp.gt.u32 %p121, %r752, 7;
+ mov.u32 %r752, %r95;
@%p121 bra $L__BB0_161;
$L__BB0_164:
- mov.u32 %r759, 0;
+ mov.u32 %r753, 0;
@%p10 bra $L__BB0_168;
ld.shared.f32 %f556, [%rd22];
add.f32 %f754, %f556, 0f00000000;
setp.lt.u32 %p123, %r4, 2;
@@ -1562,54 +1560,54 @@
ld.shared.f32 %f557, [%rd23];
add.f32 %f754, %f754, %f557;
$L__BB0_167:
- mov.b32 %r759, %f754;
+ mov.b32 %r753, %f754;
$L__BB0_168:
bar.sync 0;
st.shared.f32 [%rd22], %f690;
bar.sync 0;
@%p52 bra $L__BB0_170;
- shl.b32 %r472, %r3, %r48;
- add.s32 %r473, %r47, %r472;
- mul.wide.s32 %rd153, %r473, 4;
+ shl.b32 %r471, %r3, %r47;
+ add.s32 %r472, %r46, %r471;
+ mul.wide.s32 %rd153, %r472, 4;
add.s64 %rd155, %rd44, %rd153;
ld.shared.f32 %f558, [%rd22];
ld.shared.f32 %f559, [%rd155];
add.f32 %f560, %f559, %f558;
st.shared.f32 [%rd22], %f560;
$L__BB0_170:
bar.sync 0;
@%p53 bra $L__BB0_175;
- mov.u32 %r760, %r766;
+ mov.u32 %r754, %r760;
$L__BB0_172:
- shr.u32 %r100, %r760, 1;
- setp.ge.u32 %p126, %r9, %r100;
+ shr.u32 %r99, %r754, 1;
+ setp.ge.u32 %p126, %r9, %r99;
@%p126 bra $L__BB0_174;
- mad.lo.s32 %r474, %r100, %r3, %r47;
- mul.wide.s32 %rd156, %r474, 4;
+ mad.lo.s32 %r473, %r99, %r3, %r46;
+ mul.wide.s32 %rd156, %r473, 4;
add.s64 %rd158, %rd44, %rd156;
ld.shared.f32 %f561, [%rd22];
ld.shared.f32 %f562, [%rd158];
add.f32 %f563, %f562, %f561;
st.shared.f32 [%rd22], %f563;
$L__BB0_174:
bar.sync 0;
- setp.gt.u32 %p127, %r760, 7;
- mov.u32 %r760, %r100;
+ setp.gt.u32 %p127, %r754, 7;
+ mov.u32 %r754, %r99;
@%p127 bra $L__BB0_172;
$L__BB0_175:
- mov.u32 %r761, 0;
+ mov.u32 %r755, 0;
@%p10 bra $L__BB0_179;
ld.shared.f32 %f564, [%rd22];
add.f32 %f755, %f564, 0f00000000;
setp.lt.u32 %p129, %r4, 2;
@@ -1617,54 +1615,54 @@
ld.shared.f32 %f565, [%rd23];
add.f32 %f755, %f755, %f565;
$L__BB0_178:
- mov.b32 %r761, %f755;
+ mov.b32 %r755, %f755;
$L__BB0_179:
bar.sync 0;
st.shared.f32 [%rd22], %f689;
bar.sync 0;
@%p52 bra $L__BB0_181;
- shl.b32 %r476, %r3, %r48;
- add.s32 %r477, %r47, %r476;
- mul.wide.s32 %rd159, %r477, 4;
+ shl.b32 %r475, %r3, %r47;
+ add.s32 %r476, %r46, %r475;
+ mul.wide.s32 %rd159, %r476, 4;
add.s64 %rd161, %rd44, %rd159;
ld.shared.f32 %f566, [%rd22];
ld.shared.f32 %f567, [%rd161];
add.f32 %f568, %f567, %f566;
st.shared.f32 [%rd22], %f568;
$L__BB0_181:
bar.sync 0;
@%p53 bra $L__BB0_186;
- mov.u32 %r762, %r766;
+ mov.u32 %r756, %r760;
$L__BB0_183:
- shr.u32 %r104, %r762, 1;
- setp.ge.u32 %p132, %r9, %r104;
+ shr.u32 %r103, %r756, 1;
+ setp.ge.u32 %p132, %r9, %r103;
@%p132 bra $L__BB0_185;
- mad.lo.s32 %r478, %r104, %r3, %r47;
- mul.wide.s32 %rd162, %r478, 4;
+ mad.lo.s32 %r477, %r103, %r3, %r46;
+ mul.wide.s32 %rd162, %r477, 4;
add.s64 %rd164, %rd44, %rd162;
ld.shared.f32 %f569, [%rd22];
ld.shared.f32 %f570, [%rd164];
add.f32 %f571, %f570, %f569;
st.shared.f32 [%rd22], %f571;
$L__BB0_185:
bar.sync 0;
- setp.gt.u32 %p133, %r762, 7;
- mov.u32 %r762, %r104;
+ setp.gt.u32 %p133, %r756, 7;
+ mov.u32 %r756, %r103;
@%p133 bra $L__BB0_183;
$L__BB0_186:
- mov.u32 %r763, 0;
+ mov.u32 %r757, 0;
@%p10 bra $L__BB0_190;
ld.shared.f32 %f572, [%rd22];
add.f32 %f756, %f572, 0f00000000;
setp.lt.u32 %p135, %r4, 2;
@@ -1672,54 +1670,54 @@
ld.shared.f32 %f573, [%rd23];
add.f32 %f756, %f756, %f573;
$L__BB0_189:
- mov.b32 %r763, %f756;
+ mov.b32 %r757, %f756;
$L__BB0_190:
bar.sync 0;
st.shared.f32 [%rd22], %f688;
bar.sync 0;
@%p52 bra $L__BB0_192;
- shl.b32 %r480, %r3, %r48;
- add.s32 %r481, %r47, %r480;
- mul.wide.s32 %rd165, %r481, 4;
+ shl.b32 %r479, %r3, %r47;
+ add.s32 %r480, %r46, %r479;
+ mul.wide.s32 %rd165, %r480, 4;
add.s64 %rd167, %rd44, %rd165;
ld.shared.f32 %f574, [%rd22];
ld.shared.f32 %f575, [%rd167];
add.f32 %f576, %f575, %f574;
st.shared.f32 [%rd22], %f576;
$L__BB0_192:
bar.sync 0;
@%p53 bra $L__BB0_197;
- mov.u32 %r764, %r766;
+ mov.u32 %r758, %r760;
$L__BB0_194:
- shr.u32 %r108, %r764, 1;
- setp.ge.u32 %p138, %r9, %r108;
+ shr.u32 %r107, %r758, 1;
+ setp.ge.u32 %p138, %r9, %r107;
@%p138 bra $L__BB0_196;
- mad.lo.s32 %r482, %r108, %r3, %r47;
- mul.wide.s32 %rd168, %r482, 4;
+ mad.lo.s32 %r481, %r107, %r3, %r46;
+ mul.wide.s32 %rd168, %r481, 4;
add.s64 %rd170, %rd44, %rd168;
ld.shared.f32 %f577, [%rd22];
ld.shared.f32 %f578, [%rd170];
add.f32 %f579, %f578, %f577;
st.shared.f32 [%rd22], %f579;
$L__BB0_196:
bar.sync 0;
- setp.gt.u32 %p139, %r764, 7;
- mov.u32 %r764, %r108;
+ setp.gt.u32 %p139, %r758, 7;
+ mov.u32 %r758, %r107;
@%p139 bra $L__BB0_194;
$L__BB0_197:
- mov.u32 %r765, 0;
+ mov.u32 %r759, 0;
@%p10 bra $L__BB0_201;
ld.shared.f32 %f580, [%rd22];
add.f32 %f757, %f580, 0f00000000;
setp.lt.u32 %p141, %r4, 2;
@@ -1727,21 +1725,21 @@
ld.shared.f32 %f581, [%rd23];
add.f32 %f757, %f757, %f581;
$L__BB0_200:
- mov.b32 %r765, %f757;
+ mov.b32 %r759, %f757;
$L__BB0_201:
bar.sync 0;
st.shared.f32 [%rd22], %f687;
bar.sync 0;
@%p52 bra $L__BB0_203;
- shl.b32 %r484, %r3, %r48;
- add.s32 %r485, %r47, %r484;
- mul.wide.s32 %rd171, %r485, 4;
+ shl.b32 %r483, %r3, %r47;
+ add.s32 %r484, %r46, %r483;
+ mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd44, %rd171;
ld.shared.f32 %f582, [%rd22];
ld.shared.f32 %f583, [%rd173];
add.f32 %f584, %f583, %f582;
st.shared.f32 [%rd22], %f584;
@@ -1749,30 +1747,30 @@
$L__BB0_203:
bar.sync 0;
@%p53 bra $L__BB0_207;
$L__BB0_204:
- shr.u32 %r112, %r766, 1;
- setp.ge.u32 %p144, %r9, %r112;
+ shr.u32 %r111, %r760, 1;
+ setp.ge.u32 %p144, %r9, %r111;
@%p144 bra $L__BB0_206;
- mad.lo.s32 %r486, %r112, %r3, %r47;
- mul.wide.s32 %rd174, %r486, 4;
+ mad.lo.s32 %r485, %r111, %r3, %r46;
+ mul.wide.s32 %rd174, %r485, 4;
add.s64 %rd176, %rd44, %rd174;
ld.shared.f32 %f585, [%rd22];
ld.shared.f32 %f586, [%rd176];
add.f32 %f587, %f586, %f585;
st.shared.f32 [%rd22], %f587;
$L__BB0_206:
bar.sync 0;
- setp.gt.u32 %p145, %r766, 7;
- mov.u32 %r766, %r112;
+ setp.gt.u32 %p145, %r760, 7;
+ mov.u32 %r760, %r111;
@%p145 bra $L__BB0_204;
$L__BB0_207:
- mov.u32 %r767, 0;
+ mov.u32 %r761, 0;
@%p10 bra $L__BB0_211;
ld.shared.f32 %f588, [%rd22];
add.f32 %f758, %f588, 0f00000000;
setp.lt.u32 %p147, %r4, 2;
@@ -1780,420 +1778,416 @@
ld.shared.f32 %f589, [%rd23];
add.f32 %f758, %f758, %f589;
$L__BB0_210:
- mov.b32 %r767, %f758;
+ mov.b32 %r761, %f758;
$L__BB0_211:
bar.sync 0;
@%p1 bra $L__BB0_216;
bra.uni $L__BB0_212;
$L__BB0_216:
@%p10 bra $L__BB0_218;
- shl.b32 %r731, %r5, 3;
- mov.u32 %r512, %ctaid.y;
- mad.lo.s32 %r513, %r203, %r512, %r731;
- add.s32 %r514, %r513, %r82;
- mul.wide.s32 %rd183, %r514, 4;
+ mov.u32 %r511, %ctaid.y;
+ mad.lo.s32 %r512, %r202, %r511, %r8;
+ add.s32 %r513, %r512, %r81;
+ mul.wide.s32 %rd183, %r513, 4;
add.s64 %rd181, %rd40, %rd183;
- st.volatile.global.v4.s32 [%rd181], {%r737,%r739,%r741,%r743};
-
- add.s32 %r515, %r514, 4;
- mul.wide.s32 %rd184, %r515, 4;
+ st.volatile.global.v4.s32 [%rd181], {%r731,%r733,%r735,%r737};
+
+ add.s32 %r514, %r513, 4;
+ mul.wide.s32 %rd184, %r514, 4;
add.s64 %rd182, %rd40, %rd184;
- st.volatile.global.v4.s32 [%rd182], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd182], {%r739,%r741,%r743,%r745};
bra.uni $L__BB0_218;
$L__BB0_212:
- shl.b32 %r728, %r5, 3;
setp.eq.s32 %p148, %r9, 0;
and.pred %p4, %p148, %p8;
not.pred %p150, %p4;
- add.s32 %r488, %r728, 3;
- sub.s32 %r115, %r488, %r203;
- mov.u32 %r489, %ctaid.y;
- mad.lo.s32 %r116, %r203, %r489, %r728;
- neg.s32 %r490, %r82;
- setp.ge.s32 %p151, %r115, %r490;
+ add.s32 %r487, %r8, 3;
+ sub.s32 %r114, %r487, %r202;
+ mov.u32 %r488, %ctaid.y;
+ mad.lo.s32 %r115, %r202, %r488, %r8;
+ neg.s32 %r489, %r81;
+ setp.ge.s32 %p151, %r114, %r489;
or.pred %p152, %p150, %p151;
@%p152 bra $L__BB0_214;
- add.s32 %r495, %r116, %r82;
- mul.wide.s32 %rd178, %r495, 4;
+ add.s32 %r494, %r115, %r81;
+ mul.wide.s32 %rd178, %r494, 4;
add.s64 %rd177, %rd40, %rd178;
- st.volatile.global.v4.s32 [%rd177], {%r737,%r739,%r741,%r743};
+ st.volatile.global.v4.s32 [%rd177], {%r731,%r733,%r735,%r737};
$L__BB0_214:
- mov.u32 %r496, -4;
- sub.s32 %r497, %r496, %r82;
- setp.ge.s32 %p153, %r115, %r497;
+ mov.u32 %r495, -4;
+ sub.s32 %r496, %r495, %r81;
+ setp.ge.s32 %p153, %r114, %r496;
or.pred %p155, %p150, %p153;
@%p155 bra $L__BB0_218;
- add.s32 %r502, %r116, %r82;
- add.s32 %r503, %r502, 4;
- mul.wide.s32 %rd180, %r503, 4;
+ add.s32 %r501, %r115, %r81;
+ add.s32 %r502, %r501, 4;
+ mul.wide.s32 %rd180, %r502, 4;
add.s64 %rd179, %rd40, %rd180;
- st.volatile.global.v4.s32 [%rd179], {%r745,%r747,%r749,%r751};
+ st.volatile.global.v4.s32 [%rd179], {%r739,%r741,%r743,%r745};
$L__BB0_218:
@%p1 bra $L__BB0_225;
bra.uni $L__BB0_219;
$L__BB0_225:
@%p10 bra $L__BB0_227;
- shl.b32 %r730, %r5, 3;
- shl.b32 %r540, %r735, 5;
- mov.u32 %r541, %ctaid.y;
- mad.lo.s32 %r542, %r203, %r541, %r730;
- add.s32 %r543, %r542, %r540;
- mul.wide.s32 %rd191, %r543, 4;
+ shl.b32 %r539, %r729, 5;
+ mov.u32 %r540, %ctaid.y;
+ mad.lo.s32 %r541, %r202, %r540, %r8;
+ add.s32 %r542, %r541, %r539;
+ mul.wide.s32 %rd191, %r542, 4;
add.s64 %rd189, %rd41, %rd191;
- st.volatile.global.v4.s32 [%rd189], {%r753,%r755,%r757,%r759};
-
- add.s32 %r544, %r543, 4;
- mul.wide.s32 %rd192, %r544, 4;
+ st.volatile.global.v4.s32 [%rd189], {%r747,%r749,%r751,%r753};
+
+ add.s32 %r543, %r542, 4;
+ mul.wide.s32 %rd192, %r543, 4;
add.s64 %rd190, %rd41, %rd192;
- st.volatile.global.v4.s32 [%rd190], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd190], {%r755,%r757,%r759,%r761};
bra.uni $L__BB0_227;
$L__BB0_219:
- shl.b32 %r729, %r5, 3;
setp.eq.s32 %p157, %r9, 0;
and.pred %p5, %p157, %p8;
- add.s32 %r516, %r729, 3;
- sub.s32 %r117, %r516, %r203;
- mov.u32 %r517, %ctaid.y;
- mad.lo.s32 %r118, %r203, %r517, %r729;
+ add.s32 %r515, %r8, 3;
+ sub.s32 %r116, %r515, %r202;
+ mov.u32 %r516, %ctaid.y;
+ mad.lo.s32 %r117, %r202, %r516, %r8;
not.pred %p159, %p5;
@%p159 bra $L__BB0_222;
- shl.b32 %r119, %r735, 5;
- neg.s32 %r518, %r119;
- setp.ge.s32 %p160, %r117, %r518;
+ shl.b32 %r118, %r729, 5;
+ neg.s32 %r517, %r118;
+ setp.ge.s32 %p160, %r116, %r517;
@%p160 bra $L__BB0_222;
- add.s32 %r523, %r118, %r119;
- mul.wide.s32 %rd186, %r523, 4;
+ add.s32 %r522, %r117, %r118;
+ mul.wide.s32 %rd186, %r522, 4;
add.s64 %rd185, %rd41, %rd186;
- st.volatile.global.v4.s32 [%rd185], {%r753,%r755,%r757,%r759};
+ st.volatile.global.v4.s32 [%rd185], {%r747,%r749,%r751,%r753};
$L__BB0_222:
@%p159 bra $L__BB0_227;
- shl.b32 %r120, %r735, 5;
- mov.u32 %r524, -4;
- sub.s32 %r525, %r524, %r120;
- setp.ge.s32 %p162, %r117, %r525;
+ shl.b32 %r119, %r729, 5;
+ mov.u32 %r523, -4;
+ sub.s32 %r524, %r523, %r119;
+ setp.ge.s32 %p162, %r116, %r524;
@%p162 bra $L__BB0_227;
- add.s32 %r530, %r118, %r120;
- add.s32 %r531, %r530, 4;
- mul.wide.s32 %rd188, %r531, 4;
+ add.s32 %r529, %r117, %r119;
+ add.s32 %r530, %r529, 4;
+ mul.wide.s32 %rd188, %r530, 4;
add.s64 %rd187, %rd41, %rd188;
- st.volatile.global.v4.s32 [%rd187], {%r761,%r763,%r765,%r767};
+ st.volatile.global.v4.s32 [%rd187], {%r755,%r757,%r759,%r761};
$L__BB0_227:
- mov.u32 %r121, %ctaid.y;
+ mov.u32 %r120, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r545, %r5, %r9;
- or.b32 %r547, %r545, %r418;
- setp.ne.s32 %p164, %r547, 0;
+ or.b32 %r544, %r5, %r9;
+ or.b32 %r546, %r544, %r417;
+ setp.ne.s32 %p164, %r546, 0;
@%p164 bra $L__BB0_231;
ld.param.u64 %rd216, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd193, %rd216;
- mov.u32 %r548, %ctaid.x;
- mov.u32 %r549, %ctaid.z;
- mov.u32 %r550, %nctaid.x;
- mad.lo.s32 %r551, %r549, %r550, %r548;
- mul.wide.s32 %rd194, %r551, 8;
+ mov.u32 %r547, %ctaid.x;
+ mov.u32 %r548, %ctaid.z;
+ mov.u32 %r549, %nctaid.x;
+ mad.lo.s32 %r550, %r548, %r549, %r547;
+ mul.wide.s32 %rd194, %r550, 8;
add.s64 %rd26, %rd193, %rd194;
- add.s32 %r552, %r11, -1;
- setp.eq.s32 %p165, %r121, %r552;
+ add.s32 %r551, %r11, -1;
+ setp.eq.s32 %p165, %r120, %r551;
cvt.s64.s32 %rd195, %r11;
mov.u64 %rd196, -9223372036854775807;
sub.s64 %rd197, %rd196, %rd195;
selp.b64 %rd198, %rd197, 1, %p165;
atom.global.add.u64 %rd27, [%rd26], %rd198;
ld.volatile.global.u64 %rd199, [%rd26];
xor.b64 %rd200, %rd199, %rd27;
setp.lt.s64 %p166, %rd200, 0;
@%p166 bra $L__BB0_231;
- mov.u32 %r768, 8;
+ mov.u32 %r762, 8;
$L__BB0_230:
- nanosleep.u32 %r768;
-
- setp.lt.u32 %p167, %r768, 256;
- selp.u32 %r555, 1, 0, %p167;
- shl.b32 %r768, %r768, %r555;
+ nanosleep.u32 %r762;
+
+ setp.lt.u32 %p167, %r762, 256;
+ selp.u32 %r554, 1, 0, %p167;
+ shl.b32 %r762, %r762, %r554;
ld.volatile.global.u64 %rd201, [%rd26];
xor.b64 %rd202, %rd201, %rd27;
setp.gt.s64 %p168, %rd202, -1;
@%p168 bra $L__BB0_230;
$L__BB0_231:
ld.param.u64 %rd215, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd214, [_ZN11kernelscope6kernelENS_6TensorINS_8__bfloatELi2ELi2EEES2_NS0_IfLi2ELi2EEES3_NS0_IS1_Li1ELi1EEES4_S2_S4_S4_S3_S3_NS0_IxLi1ELi1EEE_param_7];
bar.sync 0;
- add.s32 %r557, %r203, 1;
- shr.u32 %r558, %r557, 31;
- add.s32 %r559, %r557, %r558;
- shr.s32 %r560, %r559, 1;
- add.s32 %r561, %r4, %r560;
- add.s32 %r562, %r561, -1;
- div.s32 %r563, %r562, %r4;
- add.s32 %r564, %r11, -1;
- add.s32 %r565, %r564, %r563;
- div.s32 %r124, %r565, %r11;
- add.s32 %r125, %r564, %r3;
- shl.b32 %r126, %r9, 1;
- shl.b32 %r566, %r4, 1;
- mad.lo.s32 %r129, %r566, %r121, %r126;
- or.b32 %r127, %r129, 1;
- mul.lo.s32 %r128, %r566, %r11;
- shr.u32 %r130, %r3, 5;
- mul.lo.s32 %r567, %r46, %r130;
- shr.u32 %r131, %r5, 5;
- add.s32 %r568, %r567, %r131;
- mul.wide.u32 %rd203, %r568, 4;
+ add.s32 %r556, %r202, 1;
+ shr.u32 %r557, %r556, 31;
+ add.s32 %r558, %r556, %r557;
+ shr.s32 %r559, %r558, 1;
+ add.s32 %r560, %r4, %r559;
+ add.s32 %r561, %r560, -1;
+ div.s32 %r562, %r561, %r4;
+ add.s32 %r563, %r11, -1;
+ add.s32 %r564, %r563, %r562;
+ div.s32 %r123, %r564, %r11;
+ add.s32 %r124, %r563, %r3;
+ shl.b32 %r125, %r9, 1;
+ shl.b32 %r565, %r4, 1;
+ mad.lo.s32 %r128, %r565, %r120, %r125;
+ or.b32 %r126, %r128, 1;
+ mul.lo.s32 %r127, %r565, %r11;
+ shr.u32 %r129, %r3, 5;
+ mul.lo.s32 %r566, %r45, %r129;
+ shr.u32 %r130, %r5, 5;
+ add.s32 %r567, %r566, %r130;
+ mul.wide.u32 %rd203, %r567, 4;
add.s64 %rd28, %rd44, %rd203;
- and.b32 %r132, %r5, 31;
- add.s32 %r569, %r567, %r132;
- mul.wide.u32 %rd205, %r569, 4;
+ and.b32 %r131, %r5, 31;
+ add.s32 %r568, %r566, %r131;
+ mul.wide.u32 %rd205, %r568, 4;
add.s64 %rd29, %rd44, %rd205;
cvta.to.global.u64 %rd30, %rd214;
cvta.to.global.u64 %rd31, %rd215;
- mov.u32 %r769, 0;
+ mov.u32 %r763, 0;
bra.uni $L__BB0_232;
$L__BB0_279:
- add.s32 %r769, %r769, 1;
+ add.s32 %r763, %r763, 1;
$L__BB0_232:
.pragma "nounroll";
- setp.lt.s32 %p169, %r769, %r124;
+ setp.lt.s32 %p169, %r763, %r123;
@%p169 bra $L__BB0_258;
bra.uni $L__BB0_233;
$L__BB0_258:
- div.s32 %r154, %r125, %r3;
- setp.lt.s32 %p206, %r154, 1;
+ div.s32 %r153, %r124, %r3;
+ setp.lt.s32 %p206, %r153, 1;
mov.f32 %f769, 0f00000000;
mov.f32 %f770, %f769;
@%p206 bra $L__BB0_264;
- mul.lo.s32 %r650, %r128, %r769;
- add.s32 %r155, %r127, %r650;
- add.s32 %r156, %r129, %r650;
- mov.u32 %r649, 0;
+ mul.lo.s32 %r649, %r127, %r763;
+ add.s32 %r154, %r126, %r649;
+ add.s32 %r155, %r128, %r649;
+ mov.u32 %r648, 0;
mov.f32 %f769, 0f00000000;
- mov.u32 %r776, %r649;
+ mov.u32 %r770, %r648;
$L__BB0_260:
.pragma "nounroll";
- setp.ge.s32 %p207, %r155, %r203;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ setp.ge.s32 %p207, %r154, %r202;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p207 bra $L__BB0_263;
- mad.lo.s32 %r158, %r776, %r3, %r5;
- setp.ge.s32 %p208, %r158, %r11;
- mov.u32 %r777, %r649;
- mov.u32 %r778, %r649;
+ mad.lo.s32 %r157, %r770, %r3, %r5;
+ setp.ge.s32 %p208, %r157, %r11;
+ mov.u32 %r771, %r648;
+ mov.u32 %r772, %r648;
@%p208 bra $L__BB0_263;
- mad.lo.s32 %r657, %r158, %r203, %r156;
- mul.wide.s32 %rd211, %r657, 4;
+ mad.lo.s32 %r656, %r157, %r202, %r155;
+ mul.wide.s32 %rd211, %r656, 4;
add.s64 %rd210, %rd40, %rd211;
- ld.volatile.global.v2.s32 {%r778,%r777}, [%rd210];
+ ld.volatile.global.v2.s32 {%r772,%r771}, [%rd210];
$L__BB0_263:
- mov.b32 %f642, %r778;
+ mov.b32 %f642, %r772;
add.f32 %f770, %f770, %f642;
- mov.b32 %f643, %r777;
+ mov.b32 %f643, %r771;
add.f32 %f769, %f769, %f643;
- add.s32 %r776, %r776, 1;
- setp.lt.s32 %p209, %r776, %r154;
+ add.s32 %r770, %r770, 1;
+ setp.lt.s32 %p209, %r770, %r153;
@%p209 bra $L__BB0_260;
$L__BB0_264:
- mov.b32 %r658, %f770;
- mov.u32 %r659, 31;
- mov.u32 %r660, 16;
- mov.u32 %r661, -1;
- shfl.sync.bfly.b32 %r662|%p210, %r658, %r660, %r659, %r661;
- mov.b32 %f644, %r662;
+ mov.b32 %r657, %f770;
+ mov.u32 %r658, 31;
+ mov.u32 %r659, 16;
+ mov.u32 %r660, -1;
+ shfl.sync.bfly.b32 %r661|%p210, %r657, %r659, %r658, %r660;
+ mov.b32 %f644, %r661;
add.f32 %f645, %f770, %f644;
- mov.b32 %r663, %f645;
- mov.u32 %r664, 8;
- shfl.sync.bfly.b32 %r665|%p211, %r663, %r664, %r659, %r661;
- mov.b32 %f646, %r665;
+ mov.b32 %r662, %f645;
+ mov.u32 %r663, 8;
+ shfl.sync.bfly.b32 %r664|%p211, %r662, %r663, %r658, %r660;
+ mov.b32 %f646, %r664;
add.f32 %f647, %f645, %f646;
- mov.b32 %r666, %f647;
- mov.u32 %r667, 4;
- shfl.sync.bfly.b32 %r668|%p212, %r666, %r667, %r659, %r661;
- mov.b32 %f648, %r668;
+ mov.b32 %r665, %f647;
+ mov.u32 %r666, 4;
+ shfl.sync.bfly.b32 %r667|%p212, %r665, %r666, %r658, %r660;
+ mov.b32 %f648, %r667;
add.f32 %f649, %f647, %f648;
- mov.b32 %r669, %f649;
- mov.u32 %r670, 2;
- shfl.sync.bfly.b32 %r671|%p213, %r669, %r670, %r659, %r661;
- mov.b32 %f650, %r671;
+ mov.b32 %r668, %f649;
+ mov.u32 %r669, 2;
+ shfl.sync.bfly.b32 %r670|%p213, %r668, %r669, %r658, %r660;
+ mov.b32 %f650, %r670;
add.f32 %f651, %f649, %f650;
- mov.b32 %r672, %f651;
- mov.u32 %r673, 1;
- shfl.sync.bfly.b32 %r674|%p214, %r672, %r673, %r659, %r661;
- mov.b32 %f652, %r674;
+ mov.b32 %r671, %f651;
+ mov.u32 %r672, 1;
+ shfl.sync.bfly.b32 %r673|%p214, %r671, %r672, %r658, %r660;
+ mov.b32 %f652, %r673;
add.f32 %f772, %f651, %f652;
bar.sync 0;
- setp.ne.s32 %p215, %r132, 0;
+ setp.ne.s32 %p215, %r131, 0;
@%p215 bra $L__BB0_266;
st.shared.f32 [%rd28], %f772;
$L__BB0_266:
- setp.ne.s32 %p216, %r131, 0;
+ setp.ne.s32 %p216, %r130, 0;
bar.sync 0;
@%p216 bra $L__BB0_270;
- setp.ge.u32 %p217, %r132, %r130;
+ setp.ge.u32 %p217, %r131, %r129;
mov.f32 %f771, 0f00000000;
@%p217 bra $L__BB0_269;
ld.shared.f32 %f771, [%rd29];
$L__BB0_269:
- mov.b32 %r675, %f771;
- mov.u32 %r676, 31;
- mov.u32 %r677, 16;
- mov.u32 %r678, -1;
- shfl.sync.bfly.b32 %r679|%p218, %r675, %r677, %r676, %r678;
- mov.b32 %f654, %r679;
+ mov.b32 %r674, %f771;
+ mov.u32 %r675, 31;
+ mov.u32 %r676, 16;
+ mov.u32 %r677, -1;
+ shfl.sync.bfly.b32 %r678|%p218, %r674, %r676, %r675, %r677;
+ mov.b32 %f654, %r678;
add.f32 %f655, %f771, %f654;
- mov.b32 %r680, %f655;
- mov.u32 %r681, 8;
- shfl.sync.bfly.b32 %r682|%p219, %r680, %r681, %r676, %r678;
- mov.b32 %f656, %r682;
+ mov.b32 %r679, %f655;
+ mov.u32 %r680, 8;
+ shfl.sync.bfly.b32 %r681|%p219, %r679, %r680, %r675, %r677;
+ mov.b32 %f656, %r681;
add.f32 %f657, %f655, %f656;
- mov.b32 %r683, %f657;
- mov.u32 %r684, 4;
- shfl.sync.bfly.b32 %r685|%p220, %r683, %r684, %r676, %r678;
- mov.b32 %f658, %r685;
+ mov.b32 %r682, %f657;
+ mov.u32 %r683, 4;
+ shfl.sync.bfly.b32 %r684|%p220, %r682, %r683, %r675, %r677;
+ mov.b32 %f658, %r684;
add.f32 %f659, %f657, %f658;
- mov.b32 %r686, %f659;
- mov.u32 %r687, 2;
- shfl.sync.bfly.b32 %r688|%p221, %r686, %r687, %r676, %r678;
- mov.b32 %f660, %r688;
+ mov.b32 %r685, %f659;
+ mov.u32 %r686, 2;
+ shfl.sync.bfly.b32 %r687|%p221, %r685, %r686, %r675, %r677;
+ mov.b32 %f660, %r687;
add.f32 %f661, %f659, %f660;
- mov.b32 %r689, %f661;
- mov.u32 %r690, 1;
- shfl.sync.bfly.b32 %r691|%p222, %r689, %r690, %r676, %r678;
- mov.b32 %f662, %r691;
+ mov.b32 %r688, %f661;
+ mov.u32 %r689, 1;
+ shfl.sync.bfly.b32 %r690|%p222, %r688, %r689, %r675, %r677;
+ mov.b32 %f662, %r690;
add.f32 %f772, %f661, %f662;
$L__BB0_270:
add.f32 %f664, %f772, 0f00000000;
- setp.eq.s32 %p224, %r132, 0;
+ setp.eq.s32 %p224, %r131, 0;
selp.f32 %f663, %f664, 0f00000000, %p224;
bar.sync 0;
{ cvt.rn.bf16.f32 %rs131, %f663;}
- mov.b32 %r692, %f769;
- mov.u32 %r693, 31;
- mov.u32 %r694, 16;
- mov.u32 %r695, -1;
- shfl.sync.bfly.b32 %r696|%p225, %r692, %r694, %r693, %r695;
- mov.b32 %f665, %r696;
+ mov.b32 %r691, %f769;
+ mov.u32 %r692, 31;
+ mov.u32 %r693, 16;
+ mov.u32 %r694, -1;
+ shfl.sync.bfly.b32 %r695|%p225, %r691, %r693, %r692, %r694;
+ mov.b32 %f665, %r695;
add.f32 %f666, %f769, %f665;
- mov.b32 %r697, %f666;
- mov.u32 %r698, 8;
- shfl.sync.bfly.b32 %r699|%p226, %r697, %r698, %r693, %r695;
- mov.b32 %f667, %r699;
+ mov.b32 %r696, %f666;
+ mov.u32 %r697, 8;
+ shfl.sync.bfly.b32 %r698|%p226, %r696, %r697, %r692, %r694;
+ mov.b32 %f667, %r698;
add.f32 %f668, %f666, %f667;
- mov.b32 %r700, %f668;
- mov.u32 %r701, 4;
- shfl.sync.bfly.b32 %r702|%p227, %r700, %r701, %r693, %r695;
- mov.b32 %f669, %r702;
+ mov.b32 %r699, %f668;
+ mov.u32 %r700, 4;
+ shfl.sync.bfly.b32 %r701|%p227, %r699, %r700, %r692, %r694;
+ mov.b32 %f669, %r701;
add.f32 %f670, %f668, %f669;
- mov.b32 %r703, %f670;
- mov.u32 %r704, 2;
- shfl.sync.bfly.b32 %r705|%p228, %r703, %r704, %r693, %r695;
- mov.b32 %f671, %r705;
+ mov.b32 %r702, %f670;
+ mov.u32 %r703, 2;
+ shfl.sync.bfly.b32 %r704|%p228, %r702, %r703, %r692, %r694;
+ mov.b32 %f671, %r704;
add.f32 %f672, %f670, %f671;
- mov.b32 %r706, %f672;
- mov.u32 %r707, 1;
- shfl.sync.bfly.b32 %r708|%p229, %r706, %r707, %r693, %r695;
- mov.b32 %f673, %r708;
+ mov.b32 %r705, %f672;
+ mov.u32 %r706, 1;
+ shfl.sync.bfly.b32 %r707|%p229, %r705, %r706, %r692, %r694;
+ mov.b32 %f673, %r707;
add.f32 %f774, %f672, %f673;
bar.sync 0;
@%p215 bra $L__BB0_272;
st.shared.f32 [%rd28], %f774;
$L__BB0_272:
bar.sync 0;
@%p216 bra $L__BB0_276;
- setp.ge.u32 %p231, %r132, %r130;
+ setp.ge.u32 %p231, %r131, %r129;
mov.f32 %f773, 0f00000000;
@%p231 bra $L__BB0_275;
ld.shared.f32 %f773, [%rd29];
$L__BB0_275:
- mov.b32 %r709, %f773;
- mov.u32 %r710, 31;
- mov.u32 %r711, 16;
- mov.u32 %r712, -1;
- shfl.sync.bfly.b32 %r713|%p232, %r709, %r711, %r710, %r712;
- mov.b32 %f675, %r713;
+ mov.b32 %r708, %f773;
+ mov.u32 %r709, 31;
+ mov.u32 %r710, 16;
+ mov.u32 %r711, -1;
+ shfl.sync.bfly.b32 %r712|%p232, %r708, %r710, %r709, %r711;
+ mov.b32 %f675, %r712;
add.f32 %f676, %f773, %f675;
- mov.b32 %r714, %f676;
- mov.u32 %r715, 8;
- shfl.sync.bfly.b32 %r716|%p233, %r714, %r715, %r710, %r712;
- mov.b32 %f677, %r716;
+ mov.b32 %r713, %f676;
+ mov.u32 %r714, 8;
+ shfl.sync.bfly.b32 %r715|%p233, %r713, %r714, %r709, %r711;
+ mov.b32 %f677, %r715;
add.f32 %f678, %f676, %f677;
- mov.b32 %r717, %f678;
- mov.u32 %r718, 4;
- shfl.sync.bfly.b32 %r719|%p234, %r717, %r718, %r710, %r712;
- mov.b32 %f679, %r719;
+ mov.b32 %r716, %f678;
+ mov.u32 %r717, 4;
+ shfl.sync.bfly.b32 %r718|%p234, %r716, %r717, %r709, %r711;
+ mov.b32 %f679, %r718;
add.f32 %f680, %f678, %f679;
- mov.b32 %r720, %f680;
- mov.u32 %r721, 2;
- shfl.sync.bfly.b32 %r722|%p235, %r720, %r721, %r710, %r712;
- mov.b32 %f681, %r722;
+ mov.b32 %r719, %f680;
+ mov.u32 %r720, 2;
+ shfl.sync.bfly.b32 %r721|%p235, %r719, %r720, %r709, %r711;
+ mov.b32 %f681, %r721;
add.f32 %f682, %f680, %f681;
- mov.b32 %r723, %f682;
- mov.u32 %r724, 1;
- shfl.sync.bfly.b32 %r725|%p236, %r723, %r724, %r710, %r712;
- mov.b32 %f683, %r725;
+ mov.b32 %r722, %f682;
+ mov.u32 %r723, 1;
+ shfl.sync.bfly.b32 %r724|%p236, %r722, %r723, %r709, %r711;
+ mov.b32 %f683, %r724;
add.f32 %f774, %f682, %f683;
$L__BB0_276:
add.f32 %f685, %f774, 0f00000000;
selp.f32 %f684, %f685, 0f00000000, %p224;
@@ -2202,228 +2196,228 @@
{ cvt.rn.bf16.f32 %rs132, %f684;}
@%p6 bra $L__BB0_279;
- mul.lo.s32 %r164, %r128, %r769;
- add.s32 %r726, %r127, %r164;
- setp.ge.s32 %p239, %r726, %r203;
+ mul.lo.s32 %r163, %r127, %r763;
+ add.s32 %r725, %r126, %r163;
+ setp.ge.s32 %p239, %r725, %r202;
@%p239 bra $L__BB0_279;
- add.s32 %r727, %r129, %r164;
- mul.wide.s32 %rd212, %r727, 2;
+ add.s32 %r726, %r128, %r163;
+ mul.wide.s32 %rd212, %r726, 2;
add.s64 %rd213, %rd30, %rd212;
st.global.v2.u16 [%rd213], {%rs131, %rs132};
bra.uni $L__BB0_279;
$L__BB0_233:
- setp.lt.s32 %p170, %r124, 1;
+ setp.lt.s32 %p170, %r123, 1;
@%p170 bra $L__BB0_257;
- div.s32 %r134, %r125, %r3;
- mad.lo.s32 %r135, %r203, %r5, %r126;
- shl.b32 %r136, %r121, 1;
- shl.b32 %r137, %r11, 1;
- mul.lo.s32 %r138, %r203, %r3;
- mov.u32 %r770, 0;
+ div.s32 %r133, %r124, %r3;
+ mad.lo.s32 %r134, %r202, %r5, %r125;
+ shl.b32 %r135, %r120, 1;
+ shl.b32 %r136, %r11, 1;
+ mul.lo.s32 %r137, %r202, %r3;
+ mov.u32 %r764, 0;
$L__BB0_235:
.pragma "nounroll";
- setp.lt.s32 %p171, %r134, 1;
+ setp.lt.s32 %p171, %r133, 1;
mov.f32 %f761, 0f00000000;
mov.f32 %f762, %f761;
@%p171 bra $L__BB0_241;
- mad.lo.s32 %r140, %r128, %r770, %r127;
- mad.lo.s32 %r572, %r137, %r770, %r136;
- mad.lo.s32 %r772, %r4, %r572, %r135;
- mov.u32 %r571, 0;
+ mad.lo.s32 %r139, %r127, %r764, %r126;
+ mad.lo.s32 %r571, %r136, %r764, %r135;
+ mad.lo.s32 %r766, %r4, %r571, %r134;
+ mov.u32 %r570, 0;
mov.f32 %f761, 0f00000000;
- mov.u32 %r771, %r5;
- mov.u32 %r773, %r571;
+ mov.u32 %r765, %r5;
+ mov.u32 %r767, %r570;
$L__BB0_237:
.pragma "nounroll";
- setp.ge.s32 %p172, %r140, %r203;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p172, %r139, %r202;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p172 bra $L__BB0_240;
- setp.ge.s32 %p173, %r771, %r11;
- mov.u32 %r774, %r571;
- mov.u32 %r775, %r571;
+ setp.ge.s32 %p173, %r765, %r11;
+ mov.u32 %r768, %r570;
+ mov.u32 %r769, %r570;
@%p173 bra $L__BB0_240;
- mul.wide.s32 %rd207, %r772, 4;
+ mul.wide.s32 %rd207, %r766, 4;
add.s64 %rd206, %rd41, %rd207;
- ld.volatile.global.v2.s32 {%r775,%r774}, [%rd206];
+ ld.volatile.global.v2.s32 {%r769,%r768}, [%rd206];
$L__BB0_240:
- mov.b32 %f594, %r775;
+ mov.b32 %f594, %r769;
add.f32 %f762, %f762, %f594;
- mov.b32 %f595, %r774;
+ mov.b32 %f595, %r768;
add.f32 %f761, %f761, %f595;
- add.s32 %r772, %r772, %r138;
- add.s32 %r771, %r771, %r3;
- add.s32 %r773, %r773, 1;
- setp.lt.s32 %p174, %r773, %r134;
+ add.s32 %r766, %r766, %r137;
+ add.s32 %r765, %r765, %r3;
+ add.s32 %r767, %r767, 1;
+ setp.lt.s32 %p174, %r767, %r133;
@%p174 bra $L__BB0_237;
$L__BB0_241:
- mov.b32 %r579, %f762;
- mov.u32 %r580, 31;
- mov.u32 %r581, 16;
- mov.u32 %r582, -1;
- shfl.sync.bfly.b32 %r583|%p175, %r579, %r581, %r580, %r582;
- mov.b32 %f596, %r583;
+ mov.b32 %r578, %f762;
+ mov.u32 %r579, 31;
+ mov.u32 %r580, 16;
+ mov.u32 %r581, -1;
+ shfl.sync.bfly.b32 %r582|%p175, %r578, %r580, %r579, %r581;
+ mov.b32 %f596, %r582;
add.f32 %f597, %f762, %f596;
- mov.b32 %r584, %f597;
- mov.u32 %r585, 8;
- shfl.sync.bfly.b32 %r586|%p176, %r584, %r585, %r580, %r582;
- mov.b32 %f598, %r586;
+ mov.b32 %r583, %f597;
+ mov.u32 %r584, 8;
+ shfl.sync.bfly.b32 %r585|%p176, %r583, %r584, %r579, %r581;
+ mov.b32 %f598, %r585;
add.f32 %f599, %f597, %f598;
- mov.b32 %r587, %f599;
- mov.u32 %r588, 4;
- shfl.sync.bfly.b32 %r589|%p177, %r587, %r588, %r580, %r582;
- mov.b32 %f600, %r589;
+ mov.b32 %r586, %f599;
+ mov.u32 %r587, 4;
+ shfl.sync.bfly.b32 %r588|%p177, %r586, %r587, %r579, %r581;
+ mov.b32 %f600, %r588;
add.f32 %f601, %f599, %f600;
- mov.b32 %r590, %f601;
- mov.u32 %r591, 2;
- shfl.sync.bfly.b32 %r592|%p178, %r590, %r591, %r580, %r582;
- mov.b32 %f602, %r592;
+ mov.b32 %r589, %f601;
+ mov.u32 %r590, 2;
+ shfl.sync.bfly.b32 %r591|%p178, %r589, %r590, %r579, %r581;
+ mov.b32 %f602, %r591;
add.f32 %f603, %f601, %f602;
- mov.b32 %r593, %f603;
- mov.u32 %r594, 1;
- shfl.sync.bfly.b32 %r595|%p179, %r593, %r594, %r580, %r582;
- mov.b32 %f604, %r595;
+ mov.b32 %r592, %f603;
+ mov.u32 %r593, 1;
+ shfl.sync.bfly.b32 %r594|%p179, %r592, %r593, %r579, %r581;
+ mov.b32 %f604, %r594;
add.f32 %f764, %f603, %f604;
bar.sync 0;
- setp.ne.s32 %p180, %r132, 0;
+ setp.ne.s32 %p180, %r131, 0;
@%p180 bra $L__BB0_243;
st.shared.f32 [%rd28], %f764;
$L__BB0_243:
- setp.ne.s32 %p181, %r131, 0;
+ setp.ne.s32 %p181, %r130, 0;
bar.sync 0;
@%p181 bra $L__BB0_247;
- setp.ge.u32 %p182, %r132, %r130;
+ setp.ge.u32 %p182, %r131, %r129;
mov.f32 %f763, 0f00000000;
@%p182 bra $L__BB0_246;
ld.shared.f32 %f763, [%rd29];
$L__BB0_246:
- mov.b32 %r596, %f763;
- mov.u32 %r597, 31;
- mov.u32 %r598, 16;
- mov.u32 %r599, -1;
- shfl.sync.bfly.b32 %r600|%p183, %r596, %r598, %r597, %r599;
- mov.b32 %f606, %r600;
+ mov.b32 %r595, %f763;
+ mov.u32 %r596, 31;
+ mov.u32 %r597, 16;
+ mov.u32 %r598, -1;
+ shfl.sync.bfly.b32 %r599|%p183, %r595, %r597, %r596, %r598;
+ mov.b32 %f606, %r599;
add.f32 %f607, %f763, %f606;
- mov.b32 %r601, %f607;
- mov.u32 %r602, 8;
- shfl.sync.bfly.b32 %r603|%p184, %r601, %r602, %r597, %r599;
- mov.b32 %f608, %r603;
+ mov.b32 %r600, %f607;
+ mov.u32 %r601, 8;
+ shfl.sync.bfly.b32 %r602|%p184, %r600, %r601, %r596, %r598;
+ mov.b32 %f608, %r602;
add.f32 %f609, %f607, %f608;
- mov.b32 %r604, %f609;
- mov.u32 %r605, 4;
- shfl.sync.bfly.b32 %r606|%p185, %r604, %r605, %r597, %r599;
- mov.b32 %f610, %r606;
+ mov.b32 %r603, %f609;
+ mov.u32 %r604, 4;
+ shfl.sync.bfly.b32 %r605|%p185, %r603, %r604, %r596, %r598;
+ mov.b32 %f610, %r605;
add.f32 %f611, %f609, %f610;
- mov.b32 %r607, %f611;
- mov.u32 %r608, 2;
- shfl.sync.bfly.b32 %r609|%p186, %r607, %r608, %r597, %r599;
- mov.b32 %f612, %r609;
+ mov.b32 %r606, %f611;
+ mov.u32 %r607, 2;
+ shfl.sync.bfly.b32 %r608|%p186, %r606, %r607, %r596, %r598;
+ mov.b32 %f612, %r608;
add.f32 %f613, %f611, %f612;
- mov.b32 %r610, %f613;
- mov.u32 %r611, 1;
- shfl.sync.bfly.b32 %r612|%p187, %r610, %r611, %r597, %r599;
- mov.b32 %f614, %r612;
+ mov.b32 %r609, %f613;
+ mov.u32 %r610, 1;
+ shfl.sync.bfly.b32 %r611|%p187, %r609, %r610, %r596, %r598;
+ mov.b32 %f614, %r611;
add.f32 %f764, %f613, %f614;
$L__BB0_247:
add.f32 %f616, %f764, 0f00000000;
- setp.eq.s32 %p189, %r132, 0;
+ setp.eq.s32 %p189, %r131, 0;
selp.f32 %f615, %f616, 0f00000000, %p189;
bar.sync 0;
{ cvt.rn.bf16.f32 %rs129, %f615;}
- mov.b32 %r613, %f761;
- mov.u32 %r614, 31;
- mov.u32 %r615, 16;
- mov.u32 %r616, -1;
- shfl.sync.bfly.b32 %r617|%p190, %r613, %r615, %r614, %r616;
- mov.b32 %f617, %r617;
+ mov.b32 %r612, %f761;
+ mov.u32 %r613, 31;
+ mov.u32 %r614, 16;
+ mov.u32 %r615, -1;
+ shfl.sync.bfly.b32 %r616|%p190, %r612, %r614, %r613, %r615;
+ mov.b32 %f617, %r616;
add.f32 %f618, %f761, %f617;
- mov.b32 %r618, %f618;
- mov.u32 %r619, 8;
- shfl.sync.bfly.b32 %r620|%p191, %r618, %r619, %r614, %r616;
- mov.b32 %f619, %r620;
+ mov.b32 %r617, %f618;
+ mov.u32 %r618, 8;
+ shfl.sync.bfly.b32 %r619|%p191, %r617, %r618, %r613, %r615;
+ mov.b32 %f619, %r619;
add.f32 %f620, %f618, %f619;
- mov.b32 %r621, %f620;
- mov.u32 %r622, 4;
- shfl.sync.bfly.b32 %r623|%p192, %r621, %r622, %r614, %r616;
- mov.b32 %f621, %r623;
+ mov.b32 %r620, %f620;
+ mov.u32 %r621, 4;
+ shfl.sync.bfly.b32 %r622|%p192, %r620, %r621, %r613, %r615;
+ mov.b32 %f621, %r622;
add.f32 %f622, %f620, %f621;
- mov.b32 %r624, %f622;
- mov.u32 %r625, 2;
- shfl.sync.bfly.b32 %r626|%p193, %r624, %r625, %r614, %r616;
- mov.b32 %f623, %r626;
+ mov.b32 %r623, %f622;
+ mov.u32 %r624, 2;
+ shfl.sync.bfly.b32 %r625|%p193, %r623, %r624, %r613, %r615;
+ mov.b32 %f623, %r625;
add.f32 %f624, %f622, %f623;
- mov.b32 %r627, %f624;
- mov.u32 %r628, 1;
- shfl.sync.bfly.b32 %r629|%p194, %r627, %r628, %r614, %r616;
- mov.b32 %f625, %r629;
+ mov.b32 %r626, %f624;
+ mov.u32 %r627, 1;
+ shfl.sync.bfly.b32 %r628|%p194, %r626, %r627, %r613, %r615;
+ mov.b32 %f625, %r628;
add.f32 %f766, %f624, %f625;
bar.sync 0;
@%p180 bra $L__BB0_249;
st.shared.f32 [%rd28], %f766;
$L__BB0_249:
bar.sync 0;
@%p181 bra $L__BB0_253;
- setp.ge.u32 %p196, %r132, %r130;
+ setp.ge.u32 %p196, %r131, %r129;
mov.f32 %f765, 0f00000000;
@%p196 bra $L__BB0_252;
ld.shared.f32 %f765, [%rd29];
$L__BB0_252:
- mov.b32 %r630, %f765;
- mov.u32 %r631, 31;
- mov.u32 %r632, 16;
- mov.u32 %r633, -1;
- shfl.sync.bfly.b32 %r634|%p197, %r630, %r632, %r631, %r633;
- mov.b32 %f627, %r634;
+ mov.b32 %r629, %f765;
+ mov.u32 %r630, 31;
+ mov.u32 %r631, 16;
+ mov.u32 %r632, -1;
+ shfl.sync.bfly.b32 %r633|%p197, %r629, %r631, %r630, %r632;
+ mov.b32 %f627, %r633;
add.f32 %f628, %f765, %f627;
- mov.b32 %r635, %f628;
- mov.u32 %r636, 8;
- shfl.sync.bfly.b32 %r637|%p198, %r635, %r636, %r631, %r633;
- mov.b32 %f629, %r637;
+ mov.b32 %r634, %f628;
+ mov.u32 %r635, 8;
+ shfl.sync.bfly.b32 %r636|%p198, %r634, %r635, %r630, %r632;
+ mov.b32 %f629, %r636;
add.f32 %f630, %f628, %f629;
- mov.b32 %r638, %f630;
- mov.u32 %r639, 4;
- shfl.sync.bfly.b32 %r640|%p199, %r638, %r639, %r631, %r633;
- mov.b32 %f631, %r640;
+ mov.b32 %r637, %f630;
+ mov.u32 %r638, 4;
+ shfl.sync.bfly.b32 %r639|%p199, %r637, %r638, %r630, %r632;
+ mov.b32 %f631, %r639;
add.f32 %f632, %f630, %f631;
- mov.b32 %r641, %f632;
- mov.u32 %r642, 2;
- shfl.sync.bfly.b32 %r643|%p200, %r641, %r642, %r631, %r633;
- mov.b32 %f633, %r643;
+ mov.b32 %r640, %f632;
+ mov.u32 %r641, 2;
+ shfl.sync.bfly.b32 %r642|%p200, %r640, %r641, %r630, %r632;
+ mov.b32 %f633, %r642;
add.f32 %f634, %f632, %f633;
- mov.b32 %r644, %f634;
- mov.u32 %r645, 1;
- shfl.sync.bfly.b32 %r646|%p201, %r644, %r645, %r631, %r633;
- mov.b32 %f635, %r646;
+ mov.b32 %r643, %f634;
+ mov.u32 %r644, 1;
+ shfl.sync.bfly.b32 %r645|%p201, %r643, %r644, %r630, %r632;
+ mov.b32 %f635, %r645;
add.f32 %f766, %f634, %f635;
$L__BB0_253:
add.f32 %f637, %f766, 0f00000000;
selp.f32 %f636, %f637, 0f00000000, %p189;
@@ -2432,23 +2426,23 @@
{ cvt.rn.bf16.f32 %rs130, %f636;}
@%p6 bra $L__BB0_256;
- mul.lo.s32 %r152, %r128, %r770;
- add.s32 %r647, %r127, %r152;
- setp.ge.s32 %p204, %r647, %r203;
+ mul.lo.s32 %r151, %r127, %r764;
+ add.s32 %r646, %r126, %r151;
+ setp.ge.s32 %p204, %r646, %r202;
@%p204 bra $L__BB0_256;
- add.s32 %r648, %r129, %r152;
- mul.wide.s32 %rd208, %r648, 2;
+ add.s32 %r647, %r128, %r151;
+ mul.wide.s32 %rd208, %r647, 2;
add.s64 %rd209, %rd31, %rd208;
st.global.v2.u16 [%rd209], {%rs129, %rs130};
$L__BB0_256:
- add.s32 %r770, %r770, 1;
- setp.lt.s32 %p205, %r770, %r124;
+ add.s32 %r764, %r764, 1;
+ setp.lt.s32 %p205, %r764, %r123;
@%p205 bra $L__BB0_235;
$L__BB0_257:
ret;
24: CombinedSchedulerTest.LayerNormBackward/dtype___bfloat_batch_216_hidden_65536
Kernel 3
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-9
+9 index type: int
registers: 40
gmem: 3
static smem: 16
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T18, Tensor<float, 1, 1> T15, Tensor<float, 2, 2> T14, Tensor<float, 2, 2> T11, Tensor<__bfloat, 2, 2> T28) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T34 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T35 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T11.data;
s0.logical_size = T11.logical_size;
s0.alloc_stride = T11.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T33[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
* T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2bfloat(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<__bfloat, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T33[(((nvfuser_index_t)threadIdx.x) % 32)]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
= T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
* T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2bfloat(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T18, Tensor<float, 1, 1> T15, Tensor<float, 2, 2> T14, Tensor<float, 2, 2> T11, Tensor<__bfloat, 2, 2> T28) {
alignas(16) extern __shared__ char array[];
const unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO;
float* T34 = reinterpret_cast<float*>(array + smem_offset + 4608);
float* T35 = reinterpret_cast<float*>(array + smem_offset + 4096);
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T11.data;
s0.logical_size = T11.logical_size;
s0.alloc_stride = T11.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
= T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
= T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
* T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2bfloat(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
loadLocalToGlobal<__bfloat, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
} else {
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
T37.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i6 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T37[(4 * i6)], &T11[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i6 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
if (((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T35[((nvfuser_index_t)threadIdx.x)]
= T15[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
Array<float, 8, 4> T36;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T36.set(float(0));
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i7 + nvfuser_zero)))))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
__barrier_sync(0);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 2; ++i9) {
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
= T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
= T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
T22[0]
= T21[0]
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
* T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
= __float2bfloat(T24[0]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 2; ++i11) {
if (((((3 + (4 * (((nvfuser_index_t)threadIdx.x) % 8))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2) && (((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i11 + nvfuser_zero)))))) {
loadLocalToGlobal<__bfloat, /*vec_size=*/4, /*is_volatile=*/false>( &T28[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i11 + nvfuser_zero)))], &T38[(4 * i11)]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -17,11 +17,11 @@
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
- if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((((((nvfuser_index_t)threadIdx.x) % 8) * 4) + 3) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
+ if ((((((((nvfuser_index_t)threadIdx.x) < 32) && ((((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && (((16 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) && ((31 + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) < i2)) && (((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216))) {
T34[((nvfuser_index_t)threadIdx.x)]
= T18[(((nvfuser_index_t)threadIdx.x) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
Array<float, 8, 4> T37;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 2; ++i6) {
@@ -46,11 +46,11 @@
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T36[(4 * i7)], &T14[(((((4 * (((nvfuser_index_t)threadIdx.x) % 8)) + (i2 * (((nvfuser_index_t)threadIdx.x) / 8))) + ((32 * i2) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) + (32 * (((nvfuser_index_t)blockIdx.x) % (ceilDiv(i2, 32))))) + ((16 * i2) * (i7 + nvfuser_zero)))]);
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
- T33[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
__barrier_sync(0);
@@ -59,19 +59,19 @@
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
T16[0]
- = T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
T19[0]
- = T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
Array<float, 1, 1> T22;
@@ -80,11 +80,11 @@
- T20[0];
Array<float, 1, 1> T23;
T23[0] = 0;
T23[0]
= (float) d5
- * T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
T38[((4 * i9) + i10)]
@@ -133,11 +133,11 @@
}
NVFUSER_UPDATE_MAGIC_ZERO;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 8; ++i8) {
if ((((((nvfuser_index_t)threadIdx.x) % 32) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < 216)) {
- T33[(((nvfuser_index_t)threadIdx.x) % 32)]
+ T33[(((nvfuser_index_t)threadIdx.x) + (128 * i8))]
= T3[((T3.alloc_stride[0LL] * (((nvfuser_index_t)threadIdx.x) % 32)) + ((32 * T3.alloc_stride[0LL]) * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32)))))];
}
}
NVFUSER_UPDATE_MAGIC_ZERO;
Array<__bfloat, 8, 4> T38;
@@ -148,21 +148,21 @@
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T16;
T16[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T16[0]
- = T35[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T35[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T21;
T21[0]
= T36[((4 * i9) + i10)]
- T16[0];
Array<float, 1, 1> T19;
T19[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T19[0]
- = T34[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ = T34[(((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9)) % 128)];
}
Array<float, 1, 1> T20;
T20[0]
= T37[((4 * i9) + i10)]
* T19[0];
@@ -173,11 +173,11 @@
Array<float, 1, 1> T23;
T23[0] = 0;
if ((((-216 + (((nvfuser_index_t)threadIdx.x) / 8)) + (32 * (((nvfuser_index_t)blockIdx.x) / (ceilDiv(i2, 32))))) < (-(16 * (i9 + nvfuser_zero))))) {
T23[0]
= (float) d5
- * T33[((((nvfuser_index_t)threadIdx.x) / 8) + (16 * i9))];
+ * T33[((((128 * (((nvfuser_index_t)threadIdx.x) % 8)) + (((nvfuser_index_t)threadIdx.x) / 8)) + (16 * i9)) + (32 * i10))];
}
Array<float, 1, 1> T24;
T24[0]
= T23[0]
* T22[0];
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
.reg .f32 %f<257>;
.reg .b32 %r<243>;
.reg .f64 %fd<3>;
.reg .b64 %rd<61>;
// demoted variable
.shared .align 4 .u32 _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4+8];
ld.param.u64 %rd13, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5];
ld.param.u64 %rd12, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1];
ld.param.u64 %rd11, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_3];
ld.param.u64 %rd14, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2];
cvta.to.global.u64 %rd1, %rd11;
cvta.to.global.u64 %rd2, %rd14;
ld.param.u64 %rd4, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd15, _ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd15], %r2;
ld.shared.u32 %r4, [_ZZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_1033910nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd16, %r2, 4;
mov.u64 %rd17, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_5a46629d_103395arrayE;
add.s64 %rd6, %rd17, %rd16;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r67, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd18, %rd12;
mul.wide.s32 %rd19, %r8, 4;
add.s64 %rd8, %rd18, %rd19;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r82, %r9, %r7;
add.s32 %r83, %r82, 16;
setp.gt.s32 %p4, %r83, 215;
@%p4 bra $L__BB0_7;
and.b32 %r87, %r81, 1073741816;
sub.s32 %r88, %r2, %r87;
shl.b32 %r10, %r88, 2;
rem.s32 %r89, %r5, %r6;
shl.b32 %r11, %r89, 5;
add.s32 %r90, %r10, %r11;
or.b32 %r91, %r90, 3;
setp.ge.s32 %p5, %r91, %r67;
@%p5 bra $L__BB0_7;
shr.u32 %r93, %r79, 27;
add.s32 %r94, %r2, %r93;
and.b32 %r95, %r94, -32;
sub.s32 %r12, %r2, %r95;
add.s32 %r13, %r7, %r12;
setp.lt.s32 %p6, %r13, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
shl.b32 %r212, %r4, 5;
add.s32 %r213, %r7, %r9;
add.s32 %r214, %r213, %r212;
mad.lo.s32 %r215, %r214, %r67, %r10;
add.s32 %r216, %r215, %r11;
mul.wide.s32 %rd46, %r216, 4;
add.s64 %rd40, %rd4, %rd46;
// begin inline asm
ld.global.cs.v4.u32 {%r192,%r193,%r194,%r195}, [%rd40];
// end inline asm
add.s32 %r217, %r214, 16;
mad.lo.s32 %r218, %r217, %r67, %r10;
add.s32 %r219, %r218, %r11;
mul.wide.s32 %rd47, %r219, 4;
add.s64 %rd41, %rd4, %rd47;
// begin inline asm
ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd41];
// end inline asm
shl.b64 %rd48, %rd7, 2;
add.s64 %rd49, %rd2, %rd48;
ld.global.f32 %f166, [%rd49];
st.shared.f32 [%rd6+4096], %f166;
mul.lo.s32 %r220, %r4, 96;
add.s32 %r221, %r214, %r220;
mad.lo.s32 %r222, %r221, %r67, %r10;
add.s32 %r223, %r222, %r11;
mul.wide.s32 %rd50, %r223, 4;
add.s64 %rd42, %rd3, %rd50;
// begin inline asm
ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd42];
// end inline asm
add.s32 %r224, %r217, %r220;
mad.lo.s32 %r225, %r224, %r67, %r10;
add.s32 %r226, %r225, %r11;
mul.wide.s32 %rd51, %r226, 4;
add.s64 %rd43, %rd3, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd43];
// end inline asm
mul.lo.s32 %r227, %r13, %r62;
mul.wide.s32 %rd52, %r227, 4;
add.s64 %rd53, %rd1, %rd52;
mul.wide.s32 %rd54, %r12, 4;
add.s64 %rd56, %rd17, %rd54;
ld.global.f32 %f167, [%rd53];
st.shared.f32 [%rd56], %f167;
barrier.sync 0;
mul.wide.s32 %rd57, %r9, 4;
add.s64 %rd58, %rd17, %rd57;
ld.shared.f32 %f168, [%rd58];
cvt.rn.f32.f64 %f169, %fd1;
mul.f32 %f170, %f168, %f169;
mov.b32 %f171, %r200;
ld.shared.f32 %f172, [%rd58+4096];
sub.f32 %f173, %f171, %f172;
mov.b32 %f174, %r192;
ld.shared.f32 %f175, [%rd58+4608];
mul.f32 %f176, %f175, %f174;
sub.f32 %f177, %f173, %f176;
mul.f32 %f157, %f170, %f177;
mov.b32 %f178, %r201;
sub.f32 %f179, %f178, %f172;
mov.b32 %f180, %r193;
mul.f32 %f181, %f175, %f180;
sub.f32 %f182, %f179, %f181;
mul.f32 %f158, %f170, %f182;
// begin inline asm
{ cvt.rn.bf16.f32 %rs22, %f158;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs21, %f157;}
// end inline asm
mov.b32 %r208, {%rs21, %rs22};
mov.b32 %f183, %r202;
sub.f32 %f184, %f183, %f172;
mov.b32 %f185, %r194;
mul.f32 %f186, %f175, %f185;
sub.f32 %f187, %f184, %f186;
mul.f32 %f159, %f170, %f187;
mov.b32 %f188, %r203;
sub.f32 %f189, %f188, %f172;
mov.b32 %f190, %r195;
mul.f32 %f191, %f175, %f190;
sub.f32 %f192, %f189, %f191;
mul.f32 %f160, %f170, %f192;
// begin inline asm
{ cvt.rn.bf16.f32 %rs24, %f160;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs23, %f159;}
// end inline asm
mov.b32 %r209, {%rs23, %rs24};
ld.shared.f32 %f193, [%rd58+64];
mul.f32 %f194, %f193, %f169;
mov.b32 %f195, %r204;
ld.shared.f32 %f196, [%rd58+4160];
sub.f32 %f197, %f195, %f196;
mov.b32 %f198, %r196;
ld.shared.f32 %f199, [%rd58+4672];
mul.f32 %f200, %f199, %f198;
sub.f32 %f201, %f197, %f200;
mul.f32 %f161, %f194, %f201;
mov.b32 %f202, %r205;
sub.f32 %f203, %f202, %f196;
mov.b32 %f204, %r197;
mul.f32 %f205, %f199, %f204;
sub.f32 %f206, %f203, %f205;
mul.f32 %f162, %f194, %f206;
// begin inline asm
{ cvt.rn.bf16.f32 %rs26, %f162;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs25, %f161;}
// end inline asm
mov.b32 %r210, {%rs25, %rs26};
mov.b32 %f207, %r206;
sub.f32 %f208, %f207, %f196;
mov.b32 %f209, %r198;
mul.f32 %f210, %f199, %f209;
sub.f32 %f211, %f208, %f210;
mul.f32 %f163, %f194, %f211;
mov.b32 %f212, %r207;
sub.f32 %f213, %f212, %f196;
mov.b32 %f214, %r199;
mul.f32 %f215, %f199, %f214;
sub.f32 %f216, %f213, %f215;
mul.f32 %f164, %f194, %f216;
// begin inline asm
{ cvt.rn.bf16.f32 %rs28, %f164;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs27, %f163;}
// end inline asm
mov.b32 %r211, {%rs27, %rs28};
mul.lo.s32 %r228, %r4, 896;
add.s32 %r229, %r221, %r228;
mad.lo.s32 %r230, %r229, %r67, %r10;
add.s32 %r231, %r230, %r11;
mul.wide.s32 %rd59, %r231, 2;
add.s64 %rd44, %rd13, %rd59;
// begin inline asm
st.global.cs.v2.s32 [%rd44], {%r208,%r209};
// end inline asm
add.s32 %r232, %r224, %r228;
mad.lo.s32 %r233, %r232, %r67, %r10;
add.s32 %r234, %r233, %r11;
mul.wide.s32 %rd60, %r234, 2;
add.s64 %rd45, %rd13, %rd60;
// begin inline asm
st.global.cs.v2.s32 [%rd45], {%r210,%r211};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
mov.u32 %r14, %ctaid.x;
add.s32 %r100, %r67, 31;
shr.s32 %r101, %r100, 31;
shr.u32 %r102, %r101, 27;
add.s32 %r103, %r100, %r102;
shr.s32 %r15, %r103, 5;
shl.b32 %r16, %r4, 5;
shr.s32 %r104, %r2, 31;
shr.u32 %r105, %r104, 29;
add.s32 %r106, %r2, %r105;
and.b32 %r107, %r106, 1073741816;
sub.s32 %r108, %r2, %r107;
shl.b32 %r109, %r108, 2;
rem.s32 %r110, %r14, %r15;
shl.b32 %r111, %r110, 5;
add.s32 %r20, %r111, %r109;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
shr.s32 %r18, %r106, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r235, 0;
mov.u32 %r236, %r235;
mov.u32 %r237, %r235;
mov.u32 %r238, %r235;
@%p8 bra $L__BB0_12;
div.s32 %r116, %r14, %r15;
shl.b32 %r21, %r116, 5;
add.s32 %r117, %r19, %r21;
neg.s32 %r118, %r16;
setp.ge.s32 %p9, %r117, %r118;
@%p9 bra $L__BB0_12;
add.s32 %r123, %r16, %r18;
add.s32 %r124, %r123, %r21;
mad.lo.s32 %r125, %r124, %r67, %r20;
mul.wide.s32 %rd21, %r125, 4;
add.s64 %rd20, %rd4, %rd21;
// begin inline asm
ld.global.cs.v4.u32 {%r238,%r237,%r236,%r235}, [%rd20];
// end inline asm
$L__BB0_12:
mov.f32 %f225, 0f00000000;
mov.f32 %f226, 0f00000000;
mov.f32 %f227, 0f00000000;
mov.f32 %f228, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r126, %r14, %r15;
shl.b32 %r30, %r126, 5;
add.s32 %r127, %r19, %r30;
mov.u32 %r128, -16;
sub.s32 %r129, %r128, %r16;
setp.ge.s32 %p11, %r127, %r129;
@%p11 bra $L__BB0_15;
add.s32 %r134, %r16, %r18;
add.s32 %r135, %r134, %r30;
add.s32 %r136, %r135, 16;
mad.lo.s32 %r137, %r136, %r67, %r20;
mul.wide.s32 %rd23, %r137, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
// end inline asm
mov.b32 %f228, %r130;
mov.b32 %f227, %r131;
mov.b32 %f226, %r132;
mov.b32 %f225, %r133;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r138, %r14, %r15;
shl.b32 %r139, %r138, 5;
add.s32 %r32, %r139, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd24, %r32, 4;
add.s64 %rd25, %rd2, %rd24;
ld.global.f32 %f91, [%rd25];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
mov.u32 %r239, 0;
mov.u32 %r240, %r239;
mov.u32 %r241, %r239;
mov.u32 %r242, %r239;
@%p8 bra $L__BB0_21;
div.s32 %r148, %r14, %r15;
shl.b32 %r33, %r148, 5;
add.s32 %r149, %r19, %r33;
neg.s32 %r150, %r31;
setp.ge.s32 %p15, %r149, %r150;
@%p15 bra $L__BB0_21;
add.s32 %r155, %r31, %r18;
add.s32 %r156, %r155, %r33;
mad.lo.s32 %r157, %r156, %r67, %r20;
mul.wide.s32 %rd27, %r157, 4;
add.s64 %rd26, %rd3, %rd27;
// begin inline asm
ld.global.cs.v4.u32 {%r242,%r241,%r240,%r239}, [%rd26];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
mov.f32 %f229, 0f00000000;
mov.f32 %f230, 0f00000000;
mov.f32 %f231, 0f00000000;
mov.f32 %f232, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r158, %r14, %r15;
shl.b32 %r42, %r158, 5;
add.s32 %r159, %r19, %r42;
mov.u32 %r160, -16;
sub.s32 %r161, %r160, %r31;
setp.ge.s32 %p17, %r159, %r161;
@%p17 bra $L__BB0_24;
add.s32 %r166, %r31, %r18;
add.s32 %r167, %r166, %r42;
add.s32 %r168, %r167, 16;
mad.lo.s32 %r169, %r168, %r67, %r20;
mul.wide.s32 %rd29, %r169, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
// end inline asm
mov.b32 %f232, %r162;
mov.b32 %f231, %r163;
mov.b32 %f230, %r164;
mov.b32 %f229, %r165;
$L__BB0_24:
div.s32 %r170, %r14, %r15;
shl.b32 %r43, %r170, 5;
shr.u32 %r172, %r104, 27;
add.s32 %r173, %r2, %r172;
and.b32 %r174, %r173, -32;
sub.s32 %r44, %r2, %r174;
add.s32 %r175, %r43, %r44;
setp.gt.s32 %p18, %r175, 215;
mul.lo.s32 %r176, %r175, %r62;
mul.wide.s32 %rd30, %r176, 4;
add.s64 %rd9, %rd1, %rd30;
@%p18 bra $L__BB0_26;
mul.wide.s32 %rd31, %r44, 4;
add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f100, [%rd9];
st.shared.f32 [%rd33], %f100;
$L__BB0_26:
shl.b32 %r45, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r46, %r45;
add.s32 %r47, %r19, %r43;
setp.ge.s32 %p19, %r47, %r46;
mul.wide.s32 %rd34, %r18, 4;
add.s64 %rd10, %rd17, %rd34;
mov.f32 %f234, 0f00000000;
mov.f32 %f233, %f234;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f233, [%rd10+4096];
$L__BB0_28:
mov.b32 %f103, %r242;
sub.f32 %f20, %f103, %f233;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f234, [%rd10+4608];
$L__BB0_30:
mov.b32 %f105, %r238;
mul.f32 %f106, %f234, %f105;
sub.f32 %f23, %f20, %f106;
mov.f32 %f236, 0f00000000;
mov.f32 %f235, %f236;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f107, [%rd10];
mul.f32 %f235, %f107, %f17;
$L__BB0_32:
mul.f32 %f108, %f23, %f235;
// begin inline asm
{ cvt.rn.bf16.f32 %rs13, %f108;}
// end inline asm
@%p19 bra $L__BB0_34;
ld.shared.f32 %f236, [%rd10+4096];
$L__BB0_34:
mov.b32 %f111, %r241;
sub.f32 %f28, %f111, %f236;
mov.f32 %f238, 0f00000000;
mov.f32 %f237, %f238;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f237, [%rd10+4608];
$L__BB0_36:
mov.b32 %f113, %r237;
mul.f32 %f114, %f237, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f115, [%rd10];
mul.f32 %f238, %f115, %f17;
$L__BB0_38:
mul.f32 %f116, %f31, %f238;
// begin inline asm
{ cvt.rn.bf16.f32 %rs14, %f116;}
// end inline asm
mov.f32 %f240, 0f00000000;
mov.f32 %f239, %f240;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f239, [%rd10+4096];
$L__BB0_40:
mov.b32 %f119, %r240;
sub.f32 %f36, %f119, %f239;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f240, [%rd10+4608];
$L__BB0_42:
mov.b32 %f121, %r236;
mul.f32 %f122, %f240, %f121;
sub.f32 %f39, %f36, %f122;
mov.f32 %f242, 0f00000000;
mov.f32 %f241, %f242;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f123, [%rd10];
mul.f32 %f241, %f123, %f17;
$L__BB0_44:
mul.f32 %f124, %f39, %f241;
// begin inline asm
{ cvt.rn.bf16.f32 %rs15, %f124;}
// end inline asm
@%p19 bra $L__BB0_46;
ld.shared.f32 %f242, [%rd10+4096];
$L__BB0_46:
mov.b32 %f127, %r239;
sub.f32 %f44, %f127, %f242;
mov.f32 %f244, 0f00000000;
mov.f32 %f243, %f244;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f243, [%rd10+4608];
$L__BB0_48:
mov.b32 %f129, %r235;
mul.f32 %f130, %f243, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f131, [%rd10];
mul.f32 %f244, %f131, %f17;
$L__BB0_50:
mul.f32 %f132, %f47, %f244;
// begin inline asm
{ cvt.rn.bf16.f32 %rs16, %f132;}
// end inline asm
mov.u32 %r177, -16;
sub.s32 %r48, %r177, %r45;
setp.ge.s32 %p31, %r47, %r48;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f245, [%rd10+4160];
$L__BB0_52:
sub.f32 %f52, %f232, %f245;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f246, [%rd10+4672];
$L__BB0_54:
mul.f32 %f136, %f246, %f228;
sub.f32 %f55, %f52, %f136;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f137, [%rd10+64];
mul.f32 %f247, %f137, %f17;
$L__BB0_56:
mul.f32 %f138, %f55, %f247;
// begin inline asm
{ cvt.rn.bf16.f32 %rs17, %f138;}
// end inline asm
@%p31 bra $L__BB0_58;
ld.shared.f32 %f248, [%rd10+4160];
$L__BB0_58:
sub.f32 %f60, %f231, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f249, [%rd10+4672];
$L__BB0_60:
mul.f32 %f142, %f249, %f227;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f143, [%rd10+64];
mul.f32 %f250, %f143, %f17;
$L__BB0_62:
mul.f32 %f144, %f63, %f250;
// begin inline asm
{ cvt.rn.bf16.f32 %rs18, %f144;}
// end inline asm
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f251, [%rd10+4160];
$L__BB0_64:
sub.f32 %f68, %f230, %f251;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f252, [%rd10+4672];
$L__BB0_66:
mul.f32 %f148, %f252, %f226;
sub.f32 %f71, %f68, %f148;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f149, [%rd10+64];
mul.f32 %f253, %f149, %f17;
$L__BB0_68:
mul.f32 %f150, %f71, %f253;
// begin inline asm
{ cvt.rn.bf16.f32 %rs19, %f150;}
// end inline asm
@%p31 bra $L__BB0_70;
ld.shared.f32 %f254, [%rd10+4160];
$L__BB0_70:
sub.f32 %f76, %f229, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f255, [%rd10+4672];
$L__BB0_72:
mul.f32 %f154, %f255, %f225;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f155, [%rd10+64];
mul.f32 %f256, %f155, %f17;
$L__BB0_74:
mul.f32 %f156, %f79, %f256;
// begin inline asm
{ cvt.rn.bf16.f32 %rs20, %f156;}
// end inline asm
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r178, %r49;
setp.ge.s32 %p44, %r47, %r178;
@%p44 bra $L__BB0_77;
add.s32 %r181, %r49, %r18;
add.s32 %r182, %r181, %r43;
mad.lo.s32 %r183, %r182, %r67, %r20;
mul.wide.s32 %rd37, %r183, 2;
add.s64 %rd36, %rd13, %rd37;
mov.b32 %r180, {%rs15, %rs16};
mov.b32 %r179, {%rs13, %rs14};
// begin inline asm
st.global.cs.v2.s32 [%rd36], {%r179,%r180};
// end inline asm
$L__BB0_77:
mov.u32 %r184, -16;
sub.s32 %r185, %r184, %r49;
setp.ge.s32 %p46, %r47, %r185;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r188, %r49, %r18;
add.s32 %r189, %r188, %r43;
add.s32 %r190, %r189, 16;
mad.lo.s32 %r191, %r190, %r67, %r20;
mul.wide.s32 %rd39, %r191, 2;
add.s64 %rd38, %rd13, %rd39;
mov.b32 %r187, {%rs19, %rs20};
mov.b32 %r186, {%rs17, %rs18};
// begin inline asm
st.global.cs.v2.s32 [%rd38], {%r186,%r187};
// end inline asm
$L__BB0_80:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
// _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s has been demoted
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
.reg .f32 %f<269>;
.reg .b32 %r<266>;
.reg .f64 %fd<3>;
.reg .b64 %rd<69>;
// demoted variable
.shared .align 4 .u32 _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4+8];
ld.param.u64 %rd15, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5];
ld.param.u64 %rd14, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1];
ld.param.u64 %rd13, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_3];
ld.param.u64 %rd16, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2];
cvta.to.global.u64 %rd1, %rd13;
cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
mov.u64 %rd17, _ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_723310nvfuser_52ENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
mul.wide.s32 %rd18, %r2, 4;
mov.u64 %rd19, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_52_cu_4d740913_72335arrayE;
add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
mov.u32 %r5, %ctaid.x;
add.s32 %r74, %r67, 31;
shr.s32 %r75, %r74, 31;
shr.u32 %r76, %r75, 27;
add.s32 %r77, %r74, %r76;
shr.s32 %r6, %r77, 5;
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
cvta.to.global.u64 %rd20, %rd14;
mul.wide.s32 %rd21, %r8, 4;
add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
add.s32 %r10, %r9, 16;
add.s32 %r82, %r10, %r7;
setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
rem.s32 %r83, %r5, %r6;
shl.b32 %r11, %r83, 5;
or.b32 %r84, %r11, 31;
setp.ge.s32 %p5, %r84, %r67;
@%p5 bra $L__BB0_7;
shr.u32 %r86, %r79, 27;
add.s32 %r87, %r2, %r86;
and.b32 %r88, %r87, -32;
sub.s32 %r89, %r2, %r88;
add.s32 %r12, %r7, %r89;
setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
and.b32 %r220, %r81, -8;
sub.s32 %r221, %r2, %r220;
shl.b32 %r222, %r221, 2;
shl.b32 %r223, %r4, 5;
add.s32 %r224, %r7, %r9;
add.s32 %r225, %r224, %r223;
mad.lo.s32 %r226, %r225, %r67, %r222;
add.s32 %r227, %r226, %r11;
mul.wide.s32 %rd51, %r227, 4;
add.s64 %rd45, %rd4, %rd51;
// begin inline asm
ld.global.cs.v4.u32 {%r197,%r198,%r199,%r200}, [%rd45];
// end inline asm
add.s32 %r228, %r225, 16;
mad.lo.s32 %r229, %r228, %r67, %r222;
add.s32 %r230, %r229, %r11;
mul.wide.s32 %rd52, %r230, 4;
add.s64 %rd46, %rd4, %rd52;
// begin inline asm
ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd46];
// end inline asm
shl.b64 %rd53, %rd7, 2;
add.s64 %rd54, %rd2, %rd53;
ld.global.f32 %f166, [%rd54];
st.shared.f32 [%rd6+4096], %f166;
mul.lo.s32 %r231, %r4, 96;
add.s32 %r232, %r225, %r231;
mad.lo.s32 %r233, %r232, %r67, %r222;
add.s32 %r234, %r233, %r11;
mul.wide.s32 %rd55, %r234, 4;
add.s64 %rd47, %rd3, %rd55;
// begin inline asm
ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd47];
// end inline asm
add.s32 %r235, %r228, %r231;
mad.lo.s32 %r236, %r235, %r67, %r222;
add.s32 %r237, %r236, %r11;
mul.wide.s32 %rd56, %r237, 4;
add.s64 %rd48, %rd3, %rd56;
// begin inline asm
ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd48];
// end inline asm
mul.lo.s32 %r238, %r12, %r62;
mul.wide.s32 %rd57, %r238, 4;
add.s64 %rd58, %rd1, %rd57;
ld.global.f32 %f167, [%rd58];
st.shared.f32 [%rd6], %f167;
st.shared.f32 [%rd6+512], %f167;
st.shared.f32 [%rd6+1024], %f167;
st.shared.f32 [%rd6+1536], %f167;
st.shared.f32 [%rd6+2048], %f167;
st.shared.f32 [%rd6+2560], %f167;
st.shared.f32 [%rd6+3072], %f167;
st.shared.f32 [%rd6+3584], %f167;
barrier.sync 0;
shl.b32 %r239, %r221, 7;
add.s32 %r240, %r239, %r9;
shr.s32 %r241, %r9, 31;
shr.u32 %r242, %r241, 25;
add.s32 %r243, %r9, %r242;
and.b32 %r244, %r243, -128;
sub.s32 %r245, %r9, %r244;
mul.wide.s32 %rd59, %r245, 4;
add.s64 %rd61, %rd19, 4096;
add.s64 %rd62, %rd61, %rd59;
mov.b32 %f168, %r205;
ld.shared.f32 %f169, [%rd62];
sub.f32 %f170, %f168, %f169;
mov.b32 %f171, %r197;
ld.shared.f32 %f172, [%rd62+512];
mul.f32 %f173, %f172, %f171;
sub.f32 %f174, %f170, %f173;
mul.wide.s32 %rd63, %r240, 4;
add.s64 %rd64, %rd19, %rd63;
ld.shared.f32 %f175, [%rd64];
cvt.rn.f32.f64 %f176, %fd1;
mul.f32 %f177, %f175, %f176;
mul.f32 %f157, %f177, %f174;
mov.b32 %f178, %r206;
sub.f32 %f179, %f178, %f169;
mov.b32 %f180, %r198;
mul.f32 %f181, %f172, %f180;
sub.f32 %f182, %f179, %f181;
ld.shared.f32 %f183, [%rd64+128];
mul.f32 %f184, %f183, %f176;
mul.f32 %f158, %f184, %f182;
// begin inline asm
{ cvt.rn.bf16.f32 %rs22, %f158;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs21, %f157;}
// end inline asm
mov.b32 %r213, {%rs21, %rs22};
mov.b32 %f185, %r207;
sub.f32 %f186, %f185, %f169;
mov.b32 %f187, %r199;
mul.f32 %f188, %f172, %f187;
sub.f32 %f189, %f186, %f188;
ld.shared.f32 %f190, [%rd64+256];
mul.f32 %f191, %f190, %f176;
mul.f32 %f159, %f191, %f189;
mov.b32 %f192, %r208;
sub.f32 %f193, %f192, %f169;
mov.b32 %f194, %r200;
mul.f32 %f195, %f172, %f194;
sub.f32 %f196, %f193, %f195;
ld.shared.f32 %f197, [%rd64+384];
mul.f32 %f198, %f197, %f176;
mul.f32 %f160, %f198, %f196;
// begin inline asm
{ cvt.rn.bf16.f32 %rs24, %f160;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs23, %f159;}
// end inline asm
mov.b32 %r214, {%rs23, %rs24};
shr.s32 %r246, %r10, 31;
shr.u32 %r247, %r246, 25;
add.s32 %r248, %r10, %r247;
and.b32 %r249, %r248, -128;
sub.s32 %r250, %r10, %r249;
mul.wide.s32 %rd65, %r250, 4;
add.s64 %rd66, %rd61, %rd65;
mov.b32 %f199, %r209;
ld.shared.f32 %f200, [%rd66];
sub.f32 %f201, %f199, %f200;
mov.b32 %f202, %r201;
ld.shared.f32 %f203, [%rd66+512];
mul.f32 %f204, %f203, %f202;
sub.f32 %f205, %f201, %f204;
ld.shared.f32 %f206, [%rd64+64];
mul.f32 %f207, %f206, %f176;
mul.f32 %f161, %f207, %f205;
mov.b32 %f208, %r210;
sub.f32 %f209, %f208, %f200;
mov.b32 %f210, %r202;
mul.f32 %f211, %f203, %f210;
sub.f32 %f212, %f209, %f211;
ld.shared.f32 %f213, [%rd64+192];
mul.f32 %f214, %f213, %f176;
mul.f32 %f162, %f214, %f212;
// begin inline asm
{ cvt.rn.bf16.f32 %rs26, %f162;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs25, %f161;}
// end inline asm
mov.b32 %r215, {%rs25, %rs26};
mov.b32 %f215, %r211;
sub.f32 %f216, %f215, %f200;
mov.b32 %f217, %r203;
mul.f32 %f218, %f203, %f217;
sub.f32 %f219, %f216, %f218;
ld.shared.f32 %f220, [%rd64+320];
mul.f32 %f221, %f220, %f176;
mul.f32 %f163, %f221, %f219;
mov.b32 %f222, %r212;
sub.f32 %f223, %f222, %f200;
mov.b32 %f224, %r204;
mul.f32 %f225, %f203, %f224;
sub.f32 %f226, %f223, %f225;
ld.shared.f32 %f227, [%rd64+448];
mul.f32 %f228, %f227, %f176;
mul.f32 %f164, %f228, %f226;
// begin inline asm
{ cvt.rn.bf16.f32 %rs28, %f164;}
// end inline asm
// begin inline asm
{ cvt.rn.bf16.f32 %rs27, %f163;}
// end inline asm
mov.b32 %r216, {%rs27, %rs28};
shl.b32 %r251, %r4, 10;
add.s32 %r252, %r224, %r251;
mad.lo.s32 %r253, %r252, %r67, %r222;
add.s32 %r254, %r253, %r11;
mul.wide.s32 %rd67, %r254, 2;
add.s64 %rd49, %rd15, %rd67;
// begin inline asm
st.global.cs.v2.s32 [%rd49], {%r213,%r214};
// end inline asm
add.s32 %r255, %r252, 16;
mad.lo.s32 %r256, %r255, %r67, %r222;
add.s32 %r257, %r256, %r11;
mul.wide.s32 %rd68, %r257, 2;
add.s64 %rd50, %rd15, %rd68;
// begin inline asm
st.global.cs.v2.s32 [%rd50], {%r215,%r216};
// end inline asm
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
mov.u32 %r13, %ctaid.x;
add.s32 %r94, %r67, 31;
shr.s32 %r95, %r94, 31;
shr.u32 %r96, %r95, 27;
add.s32 %r97, %r94, %r96;
shr.s32 %r14, %r97, 5;
shl.b32 %r15, %r4, 5;
shr.s32 %r98, %r2, 31;
shr.u32 %r99, %r98, 29;
add.s32 %r100, %r2, %r99;
and.b32 %r101, %r100, -8;
sub.s32 %r16, %r2, %r101;
shl.b32 %r102, %r16, 2;
rem.s32 %r103, %r13, %r14;
shl.b32 %r104, %r103, 5;
add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
mov.u32 %r258, 0;
mov.u32 %r259, %r258;
mov.u32 %r260, %r258;
mov.u32 %r261, %r258;
@%p8 bra $L__BB0_12;
div.s32 %r109, %r13, %r14;
shl.b32 %r21, %r109, 5;
add.s32 %r110, %r19, %r21;
neg.s32 %r111, %r15;
setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
add.s32 %r116, %r15, %r18;
add.s32 %r117, %r116, %r21;
mad.lo.s32 %r118, %r117, %r67, %r20;
mul.wide.s32 %rd23, %r118, 4;
add.s64 %rd22, %rd4, %rd23;
// begin inline asm
ld.global.cs.v4.u32 {%r261,%r260,%r259,%r258}, [%rd22];
// end inline asm
$L__BB0_12:
mov.f32 %f237, 0f00000000;
mov.f32 %f238, 0f00000000;
mov.f32 %f239, 0f00000000;
mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
div.s32 %r119, %r13, %r14;
shl.b32 %r30, %r119, 5;
add.s32 %r120, %r19, %r30;
mov.u32 %r121, -16;
sub.s32 %r122, %r121, %r15;
setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
add.s32 %r127, %r15, %r18;
add.s32 %r128, %r127, %r30;
add.s32 %r129, %r128, 16;
mad.lo.s32 %r130, %r129, %r67, %r20;
mul.wide.s32 %rd25, %r130, 4;
add.s64 %rd24, %rd4, %rd25;
// begin inline asm
ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
// end inline asm
mov.b32 %f240, %r123;
mov.b32 %f239, %r124;
mov.b32 %f238, %r125;
mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
div.s32 %r131, %r13, %r14;
shl.b32 %r132, %r131, 5;
add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
mul.wide.s32 %rd26, %r32, 4;
add.s64 %rd27, %rd2, %rd26;
ld.global.f32 %f91, [%rd27];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
mov.u32 %r262, 0;
mov.u32 %r263, %r262;
mov.u32 %r264, %r262;
mov.u32 %r265, %r262;
@%p8 bra $L__BB0_21;
div.s32 %r141, %r13, %r14;
shl.b32 %r33, %r141, 5;
add.s32 %r142, %r19, %r33;
neg.s32 %r143, %r31;
setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
add.s32 %r148, %r31, %r18;
add.s32 %r149, %r148, %r33;
mad.lo.s32 %r150, %r149, %r67, %r20;
mul.wide.s32 %rd29, %r150, 4;
add.s64 %rd28, %rd3, %rd29;
// begin inline asm
ld.global.cs.v4.u32 {%r265,%r264,%r263,%r262}, [%rd28];
// end inline asm
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
mov.f32 %f241, 0f00000000;
mov.f32 %f242, 0f00000000;
mov.f32 %f243, 0f00000000;
mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
div.s32 %r151, %r13, %r14;
shl.b32 %r42, %r151, 5;
add.s32 %r152, %r19, %r42;
mov.u32 %r153, -16;
sub.s32 %r154, %r153, %r31;
setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
add.s32 %r159, %r31, %r18;
add.s32 %r160, %r159, %r42;
add.s32 %r161, %r160, 16;
mad.lo.s32 %r162, %r161, %r67, %r20;
mul.wide.s32 %rd31, %r162, 4;
add.s64 %rd30, %rd3, %rd31;
// begin inline asm
ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
// end inline asm
mov.b32 %f244, %r155;
mov.b32 %f243, %r156;
mov.b32 %f242, %r157;
mov.b32 %f241, %r158;
$L__BB0_24:
div.s32 %r163, %r13, %r14;
shl.b32 %r43, %r163, 5;
shr.u32 %r165, %r98, 27;
add.s32 %r166, %r2, %r165;
and.b32 %r167, %r166, -32;
sub.s32 %r168, %r2, %r167;
add.s32 %r169, %r43, %r168;
setp.gt.s32 %p18, %r169, 215;
mul.lo.s32 %r170, %r169, %r62;
mul.wide.s32 %rd32, %r170, 4;
add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
ld.global.f32 %f100, [%rd9];
st.shared.f32 [%rd6], %f100;
st.shared.f32 [%rd6+512], %f100;
st.shared.f32 [%rd6+1024], %f100;
st.shared.f32 [%rd6+1536], %f100;
st.shared.f32 [%rd6+2048], %f100;
st.shared.f32 [%rd6+2560], %f100;
st.shared.f32 [%rd6+3072], %f100;
st.shared.f32 [%rd6+3584], %f100;
$L__BB0_26:
shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
neg.s32 %r45, %r44;
add.s32 %r46, %r19, %r43;
setp.ge.s32 %p19, %r46, %r45;
shr.s32 %r171, %r18, 31;
shr.u32 %r172, %r171, 25;
add.s32 %r173, %r18, %r172;
and.b32 %r174, %r173, -128;
sub.s32 %r175, %r18, %r174;
mul.wide.s32 %rd33, %r175, 4;
add.s64 %rd35, %rd19, %rd33;
add.s64 %rd10, %rd35, 4096;
mov.f32 %f246, 0f00000000;
mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
mov.b32 %f103, %r265;
sub.f32 %f20, %f103, %f245;
@%p19 bra $L__BB0_30;
ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
mov.b32 %f105, %r261;
mul.f32 %f106, %f246, %f105;
sub.f32 %f23, %f20, %f106;
shl.b32 %r176, %r16, 7;
add.s32 %r177, %r176, %r18;
mul.wide.s32 %rd36, %r177, 4;
add.s64 %rd11, %rd19, %rd36;
mov.f32 %f248, 0f00000000;
mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
ld.shared.f32 %f107, [%rd11];
mul.f32 %f247, %f107, %f17;
$L__BB0_32:
mul.f32 %f108, %f23, %f247;
// begin inline asm
{ cvt.rn.bf16.f32 %rs13, %f108;}
// end inline asm
@%p19 bra $L__BB0_34;
ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
mov.b32 %f111, %r264;
sub.f32 %f28, %f111, %f248;
mov.f32 %f250, 0f00000000;
mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
mov.b32 %f113, %r260;
mul.f32 %f114, %f249, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
ld.shared.f32 %f115, [%rd11+128];
mul.f32 %f250, %f115, %f17;
$L__BB0_38:
mul.f32 %f116, %f31, %f250;
// begin inline asm
{ cvt.rn.bf16.f32 %rs14, %f116;}
// end inline asm
mov.f32 %f252, 0f00000000;
mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
mov.b32 %f119, %r263;
sub.f32 %f36, %f119, %f251;
@%p19 bra $L__BB0_42;
ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
mov.b32 %f121, %r259;
mul.f32 %f122, %f252, %f121;
sub.f32 %f39, %f36, %f122;
mov.f32 %f254, 0f00000000;
mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
ld.shared.f32 %f123, [%rd11+256];
mul.f32 %f253, %f123, %f17;
$L__BB0_44:
mul.f32 %f124, %f39, %f253;
// begin inline asm
{ cvt.rn.bf16.f32 %rs15, %f124;}
// end inline asm
@%p19 bra $L__BB0_46;
ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
mov.b32 %f127, %r262;
sub.f32 %f44, %f127, %f254;
mov.f32 %f256, 0f00000000;
mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
mov.b32 %f129, %r258;
mul.f32 %f130, %f255, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
ld.shared.f32 %f131, [%rd11+384];
mul.f32 %f256, %f131, %f17;
$L__BB0_50:
mul.f32 %f132, %f47, %f256;
// begin inline asm
{ cvt.rn.bf16.f32 %rs16, %f132;}
// end inline asm
mov.u32 %r178, -16;
sub.s32 %r47, %r178, %r44;
setp.ge.s32 %p31, %r46, %r47;
add.s32 %r48, %r18, 16;
shr.s32 %r179, %r48, 31;
shr.u32 %r180, %r179, 25;
add.s32 %r181, %r48, %r180;
and.b32 %r182, %r181, -128;
sub.s32 %r183, %r48, %r182;
mul.wide.s32 %rd38, %r183, 4;
add.s64 %rd40, %rd19, %rd38;
add.s64 %rd12, %rd40, 4096;
mov.f32 %f258, 0f00000000;
mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
sub.f32 %f52, %f244, %f257;
@%p31 bra $L__BB0_54;
ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
mul.f32 %f136, %f258, %f240;
sub.f32 %f55, %f52, %f136;
mov.f32 %f260, 0f00000000;
mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
ld.shared.f32 %f137, [%rd11+64];
mul.f32 %f259, %f137, %f17;
$L__BB0_56:
mul.f32 %f138, %f55, %f259;
// begin inline asm
{ cvt.rn.bf16.f32 %rs17, %f138;}
// end inline asm
@%p31 bra $L__BB0_58;
ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
sub.f32 %f60, %f243, %f260;
mov.f32 %f262, 0f00000000;
mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
mul.f32 %f142, %f261, %f239;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
ld.shared.f32 %f143, [%rd11+192];
mul.f32 %f262, %f143, %f17;
$L__BB0_62:
mul.f32 %f144, %f63, %f262;
// begin inline asm
{ cvt.rn.bf16.f32 %rs18, %f144;}
// end inline asm
mov.f32 %f264, 0f00000000;
mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
sub.f32 %f68, %f242, %f263;
@%p31 bra $L__BB0_66;
ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
mul.f32 %f148, %f264, %f238;
sub.f32 %f71, %f68, %f148;
mov.f32 %f266, 0f00000000;
mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
ld.shared.f32 %f149, [%rd11+320];
mul.f32 %f265, %f149, %f17;
$L__BB0_68:
mul.f32 %f150, %f71, %f265;
// begin inline asm
{ cvt.rn.bf16.f32 %rs19, %f150;}
// end inline asm
@%p31 bra $L__BB0_70;
ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
sub.f32 %f76, %f241, %f266;
mov.f32 %f268, 0f00000000;
mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
mul.f32 %f154, %f267, %f237;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
ld.shared.f32 %f155, [%rd11+448];
mul.f32 %f268, %f155, %f17;
$L__BB0_74:
mul.f32 %f156, %f79, %f268;
// begin inline asm
{ cvt.rn.bf16.f32 %rs20, %f156;}
// end inline asm
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
neg.s32 %r184, %r49;
setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
add.s32 %r187, %r49, %r18;
add.s32 %r188, %r187, %r43;
mad.lo.s32 %r189, %r188, %r67, %r20;
mul.wide.s32 %rd42, %r189, 2;
add.s64 %rd41, %rd15, %rd42;
mov.b32 %r186, {%rs15, %rs16};
mov.b32 %r185, {%rs13, %rs14};
// begin inline asm
st.global.cs.v2.s32 [%rd41], {%r185,%r186};
// end inline asm
$L__BB0_77:
mov.u32 %r190, -16;
sub.s32 %r191, %r190, %r49;
setp.ge.s32 %p46, %r46, %r191;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
add.s32 %r194, %r48, %r49;
add.s32 %r195, %r194, %r43;
mad.lo.s32 %r196, %r195, %r67, %r20;
mul.wide.s32 %rd44, %r196, 2;
add.s64 %rd43, %rd15, %rd44;
mov.b32 %r193, {%rs19, %rs20};
mov.b32 %r192, {%rs17, %rs18};
// begin inline asm
st.global.cs.v2.s32 [%rd43], {%r192,%r193};
// end inline asm
$L__BB0_80:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -25,44 +25,44 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5[24]
)
{
.reg .pred %p<48>;
.reg .b16 %rs<29>;
- .reg .f32 %f<257>;
- .reg .b32 %r<243>;
+ .reg .f32 %f<269>;
+ .reg .b32 %r<266>;
.reg .f64 %fd<3>;
- .reg .b64 %rd<61>;
+ .reg .b64 %rd<69>;
.shared .align 4 .u32 _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
ld.param.v2.u32 {%r62, %r63}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0+16];
ld.param.v2.u32 {%r66, %r67}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4+8];
- ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5];
- ld.param.u64 %rd12, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1];
- ld.param.u64 %rd11, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0];
+ ld.param.u64 %rd15, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_5];
+ ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_1];
+ ld.param.u64 %rd13, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_0];
ld.param.u64 %rd3, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_3];
- ld.param.u64 %rd14, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2];
- cvta.to.global.u64 %rd1, %rd11;
- cvta.to.global.u64 %rd2, %rd14;
+ ld.param.u64 %rd16, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_2];
+ cvta.to.global.u64 %rd1, %rd13;
+ cvta.to.global.u64 %rd2, %rd16;
ld.param.u64 %rd4, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEE_param_4];
mov.u32 %r2, %tid.x;
setp.ne.s32 %p1, %r2, 0;
@%p1 bra $L__BB0_2;
mov.u32 %r72, 0;
st.shared.u32 [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s], %r72;
$L__BB0_2:
bar.sync 0;
- mov.u64 %rd15, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
- atom.shared.min.s32 %r73, [%rd15], %r2;
+ mov.u64 %rd17, _ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s;
+ atom.shared.min.s32 %r73, [%rd17], %r2;
ld.shared.u32 %r4, [_ZZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEENS0_IfLi1ELi1EEES2_S1_S1_NS0_INS_8__bfloatELi2ELi2EEEE14nvfuser_zero_s];
cvt.rn.f64.s32 %fd2, %r67;
rcp.rn.f64 %fd1, %fd2;
- mul.wide.s32 %rd16, %r2, 4;
- mov.u64 %rd17, _ZN11kernelscope6kernelE;
- add.s64 %rd6, %rd17, %rd16;
+ mul.wide.s32 %rd18, %r2, 4;
+ mov.u64 %rd19, _ZN11kernelscope6kernelE;
+ add.s64 %rd6, %rd19, %rd18;
setp.lt.s32 %p2, %r2, 32;
@%p2 bra $L__BB0_3;
bra.uni $L__BB0_9;
$L__BB0_3:
@@ -75,646 +75,702 @@
div.s32 %r78, %r5, %r6;
shl.b32 %r7, %r78, 5;
add.s32 %r8, %r7, %r2;
setp.gt.s32 %p3, %r8, 215;
cvt.s64.s32 %rd7, %r8;
- cvta.to.global.u64 %rd18, %rd12;
- mul.wide.s32 %rd19, %r8, 4;
- add.s64 %rd8, %rd18, %rd19;
+ cvta.to.global.u64 %rd20, %rd14;
+ mul.wide.s32 %rd21, %r8, 4;
+ add.s64 %rd8, %rd20, %rd21;
@%p3 bra $L__BB0_7;
shr.s32 %r79, %r2, 31;
shr.u32 %r80, %r79, 29;
add.s32 %r81, %r2, %r80;
shr.s32 %r9, %r81, 3;
- add.s32 %r82, %r9, %r7;
- add.s32 %r83, %r82, 16;
- setp.gt.s32 %p4, %r83, 215;
+ add.s32 %r10, %r9, 16;
+ add.s32 %r82, %r10, %r7;
+ setp.gt.s32 %p4, %r82, 215;
@%p4 bra $L__BB0_7;
- and.b32 %r87, %r81, 1073741816;
- sub.s32 %r88, %r2, %r87;
- shl.b32 %r10, %r88, 2;
- rem.s32 %r89, %r5, %r6;
- shl.b32 %r11, %r89, 5;
- add.s32 %r90, %r10, %r11;
- or.b32 %r91, %r90, 3;
- setp.ge.s32 %p5, %r91, %r67;
+ rem.s32 %r83, %r5, %r6;
+ shl.b32 %r11, %r83, 5;
+ or.b32 %r84, %r11, 31;
+ setp.ge.s32 %p5, %r84, %r67;
@%p5 bra $L__BB0_7;
- shr.u32 %r93, %r79, 27;
- add.s32 %r94, %r2, %r93;
- and.b32 %r95, %r94, -32;
- sub.s32 %r12, %r2, %r95;
- add.s32 %r13, %r7, %r12;
- setp.lt.s32 %p6, %r13, 216;
+ shr.u32 %r86, %r79, 27;
+ add.s32 %r87, %r2, %r86;
+ and.b32 %r88, %r87, -32;
+ sub.s32 %r89, %r2, %r88;
+ add.s32 %r12, %r7, %r89;
+ setp.lt.s32 %p6, %r12, 216;
@%p6 bra $L__BB0_79;
bra.uni $L__BB0_7;
$L__BB0_79:
ld.global.f32 %f165, [%rd8];
st.shared.f32 [%rd6+4608], %f165;
- shl.b32 %r212, %r4, 5;
- add.s32 %r213, %r7, %r9;
- add.s32 %r214, %r213, %r212;
- mad.lo.s32 %r215, %r214, %r67, %r10;
- add.s32 %r216, %r215, %r11;
- mul.wide.s32 %rd46, %r216, 4;
- add.s64 %rd40, %rd4, %rd46;
-
- ld.global.cs.v4.u32 {%r192,%r193,%r194,%r195}, [%rd40];
-
- add.s32 %r217, %r214, 16;
- mad.lo.s32 %r218, %r217, %r67, %r10;
- add.s32 %r219, %r218, %r11;
- mul.wide.s32 %rd47, %r219, 4;
- add.s64 %rd41, %rd4, %rd47;
-
- ld.global.cs.v4.u32 {%r196,%r197,%r198,%r199}, [%rd41];
-
- shl.b64 %rd48, %rd7, 2;
- add.s64 %rd49, %rd2, %rd48;
- ld.global.f32 %f166, [%rd49];
+ and.b32 %r220, %r81, -8;
+ sub.s32 %r221, %r2, %r220;
+ shl.b32 %r222, %r221, 2;
+ shl.b32 %r223, %r4, 5;
+ add.s32 %r224, %r7, %r9;
+ add.s32 %r225, %r224, %r223;
+ mad.lo.s32 %r226, %r225, %r67, %r222;
+ add.s32 %r227, %r226, %r11;
+ mul.wide.s32 %rd51, %r227, 4;
+ add.s64 %rd45, %rd4, %rd51;
+
+ ld.global.cs.v4.u32 {%r197,%r198,%r199,%r200}, [%rd45];
+
+ add.s32 %r228, %r225, 16;
+ mad.lo.s32 %r229, %r228, %r67, %r222;
+ add.s32 %r230, %r229, %r11;
+ mul.wide.s32 %rd52, %r230, 4;
+ add.s64 %rd46, %rd4, %rd52;
+
+ ld.global.cs.v4.u32 {%r201,%r202,%r203,%r204}, [%rd46];
+
+ shl.b64 %rd53, %rd7, 2;
+ add.s64 %rd54, %rd2, %rd53;
+ ld.global.f32 %f166, [%rd54];
st.shared.f32 [%rd6+4096], %f166;
- mul.lo.s32 %r220, %r4, 96;
- add.s32 %r221, %r214, %r220;
- mad.lo.s32 %r222, %r221, %r67, %r10;
- add.s32 %r223, %r222, %r11;
- mul.wide.s32 %rd50, %r223, 4;
- add.s64 %rd42, %rd3, %rd50;
-
- ld.global.cs.v4.u32 {%r200,%r201,%r202,%r203}, [%rd42];
-
- add.s32 %r224, %r217, %r220;
- mad.lo.s32 %r225, %r224, %r67, %r10;
- add.s32 %r226, %r225, %r11;
- mul.wide.s32 %rd51, %r226, 4;
- add.s64 %rd43, %rd3, %rd51;
-
- ld.global.cs.v4.u32 {%r204,%r205,%r206,%r207}, [%rd43];
-
- mul.lo.s32 %r227, %r13, %r62;
- mul.wide.s32 %rd52, %r227, 4;
- add.s64 %rd53, %rd1, %rd52;
- mul.wide.s32 %rd54, %r12, 4;
- add.s64 %rd56, %rd17, %rd54;
- ld.global.f32 %f167, [%rd53];
- st.shared.f32 [%rd56], %f167;
+ mul.lo.s32 %r231, %r4, 96;
+ add.s32 %r232, %r225, %r231;
+ mad.lo.s32 %r233, %r232, %r67, %r222;
+ add.s32 %r234, %r233, %r11;
+ mul.wide.s32 %rd55, %r234, 4;
+ add.s64 %rd47, %rd3, %rd55;
+
+ ld.global.cs.v4.u32 {%r205,%r206,%r207,%r208}, [%rd47];
+
+ add.s32 %r235, %r228, %r231;
+ mad.lo.s32 %r236, %r235, %r67, %r222;
+ add.s32 %r237, %r236, %r11;
+ mul.wide.s32 %rd56, %r237, 4;
+ add.s64 %rd48, %rd3, %rd56;
+
+ ld.global.cs.v4.u32 {%r209,%r210,%r211,%r212}, [%rd48];
+
+ mul.lo.s32 %r238, %r12, %r62;
+ mul.wide.s32 %rd57, %r238, 4;
+ add.s64 %rd58, %rd1, %rd57;
+ ld.global.f32 %f167, [%rd58];
+ st.shared.f32 [%rd6], %f167;
+ st.shared.f32 [%rd6+512], %f167;
+ st.shared.f32 [%rd6+1024], %f167;
+ st.shared.f32 [%rd6+1536], %f167;
+ st.shared.f32 [%rd6+2048], %f167;
+ st.shared.f32 [%rd6+2560], %f167;
+ st.shared.f32 [%rd6+3072], %f167;
+ st.shared.f32 [%rd6+3584], %f167;
barrier.sync 0;
- mul.wide.s32 %rd57, %r9, 4;
- add.s64 %rd58, %rd17, %rd57;
- ld.shared.f32 %f168, [%rd58];
- cvt.rn.f32.f64 %f169, %fd1;
- mul.f32 %f170, %f168, %f169;
- mov.b32 %f171, %r200;
- ld.shared.f32 %f172, [%rd58+4096];
- sub.f32 %f173, %f171, %f172;
- mov.b32 %f174, %r192;
- ld.shared.f32 %f175, [%rd58+4608];
- mul.f32 %f176, %f175, %f174;
- sub.f32 %f177, %f173, %f176;
- mul.f32 %f157, %f170, %f177;
- mov.b32 %f178, %r201;
- sub.f32 %f179, %f178, %f172;
- mov.b32 %f180, %r193;
- mul.f32 %f181, %f175, %f180;
+ shl.b32 %r239, %r221, 7;
+ add.s32 %r240, %r239, %r9;
+ shr.s32 %r241, %r9, 31;
+ shr.u32 %r242, %r241, 25;
+ add.s32 %r243, %r9, %r242;
+ and.b32 %r244, %r243, -128;
+ sub.s32 %r245, %r9, %r244;
+ mul.wide.s32 %rd59, %r245, 4;
+ add.s64 %rd61, %rd19, 4096;
+ add.s64 %rd62, %rd61, %rd59;
+ mov.b32 %f168, %r205;
+ ld.shared.f32 %f169, [%rd62];
+ sub.f32 %f170, %f168, %f169;
+ mov.b32 %f171, %r197;
+ ld.shared.f32 %f172, [%rd62+512];
+ mul.f32 %f173, %f172, %f171;
+ sub.f32 %f174, %f170, %f173;
+ mul.wide.s32 %rd63, %r240, 4;
+ add.s64 %rd64, %rd19, %rd63;
+ ld.shared.f32 %f175, [%rd64];
+ cvt.rn.f32.f64 %f176, %fd1;
+ mul.f32 %f177, %f175, %f176;
+ mul.f32 %f157, %f177, %f174;
+ mov.b32 %f178, %r206;
+ sub.f32 %f179, %f178, %f169;
+ mov.b32 %f180, %r198;
+ mul.f32 %f181, %f172, %f180;
sub.f32 %f182, %f179, %f181;
- mul.f32 %f158, %f170, %f182;
+ ld.shared.f32 %f183, [%rd64+128];
+ mul.f32 %f184, %f183, %f176;
+ mul.f32 %f158, %f184, %f182;
{ cvt.rn.bf16.f32 %rs22, %f158;}
{ cvt.rn.bf16.f32 %rs21, %f157;}
- mov.b32 %r208, {%rs21, %rs22};
- mov.b32 %f183, %r202;
- sub.f32 %f184, %f183, %f172;
- mov.b32 %f185, %r194;
- mul.f32 %f186, %f175, %f185;
- sub.f32 %f187, %f184, %f186;
- mul.f32 %f159, %f170, %f187;
- mov.b32 %f188, %r203;
- sub.f32 %f189, %f188, %f172;
- mov.b32 %f190, %r195;
- mul.f32 %f191, %f175, %f190;
- sub.f32 %f192, %f189, %f191;
- mul.f32 %f160, %f170, %f192;
+ mov.b32 %r213, {%rs21, %rs22};
+ mov.b32 %f185, %r207;
+ sub.f32 %f186, %f185, %f169;
+ mov.b32 %f187, %r199;
+ mul.f32 %f188, %f172, %f187;
+ sub.f32 %f189, %f186, %f188;
+ ld.shared.f32 %f190, [%rd64+256];
+ mul.f32 %f191, %f190, %f176;
+ mul.f32 %f159, %f191, %f189;
+ mov.b32 %f192, %r208;
+ sub.f32 %f193, %f192, %f169;
+ mov.b32 %f194, %r200;
+ mul.f32 %f195, %f172, %f194;
+ sub.f32 %f196, %f193, %f195;
+ ld.shared.f32 %f197, [%rd64+384];
+ mul.f32 %f198, %f197, %f176;
+ mul.f32 %f160, %f198, %f196;
{ cvt.rn.bf16.f32 %rs24, %f160;}
{ cvt.rn.bf16.f32 %rs23, %f159;}
- mov.b32 %r209, {%rs23, %rs24};
- ld.shared.f32 %f193, [%rd58+64];
- mul.f32 %f194, %f193, %f169;
- mov.b32 %f195, %r204;
- ld.shared.f32 %f196, [%rd58+4160];
- sub.f32 %f197, %f195, %f196;
- mov.b32 %f198, %r196;
- ld.shared.f32 %f199, [%rd58+4672];
- mul.f32 %f200, %f199, %f198;
- sub.f32 %f201, %f197, %f200;
- mul.f32 %f161, %f194, %f201;
- mov.b32 %f202, %r205;
- sub.f32 %f203, %f202, %f196;
- mov.b32 %f204, %r197;
- mul.f32 %f205, %f199, %f204;
- sub.f32 %f206, %f203, %f205;
- mul.f32 %f162, %f194, %f206;
+ mov.b32 %r214, {%rs23, %rs24};
+ shr.s32 %r246, %r10, 31;
+ shr.u32 %r247, %r246, 25;
+ add.s32 %r248, %r10, %r247;
+ and.b32 %r249, %r248, -128;
+ sub.s32 %r250, %r10, %r249;
+ mul.wide.s32 %rd65, %r250, 4;
+ add.s64 %rd66, %rd61, %rd65;
+ mov.b32 %f199, %r209;
+ ld.shared.f32 %f200, [%rd66];
+ sub.f32 %f201, %f199, %f200;
+ mov.b32 %f202, %r201;
+ ld.shared.f32 %f203, [%rd66+512];
+ mul.f32 %f204, %f203, %f202;
+ sub.f32 %f205, %f201, %f204;
+ ld.shared.f32 %f206, [%rd64+64];
+ mul.f32 %f207, %f206, %f176;
+ mul.f32 %f161, %f207, %f205;
+ mov.b32 %f208, %r210;
+ sub.f32 %f209, %f208, %f200;
+ mov.b32 %f210, %r202;
+ mul.f32 %f211, %f203, %f210;
+ sub.f32 %f212, %f209, %f211;
+ ld.shared.f32 %f213, [%rd64+192];
+ mul.f32 %f214, %f213, %f176;
+ mul.f32 %f162, %f214, %f212;
{ cvt.rn.bf16.f32 %rs26, %f162;}
{ cvt.rn.bf16.f32 %rs25, %f161;}
- mov.b32 %r210, {%rs25, %rs26};
- mov.b32 %f207, %r206;
- sub.f32 %f208, %f207, %f196;
- mov.b32 %f209, %r198;
- mul.f32 %f210, %f199, %f209;
- sub.f32 %f211, %f208, %f210;
- mul.f32 %f163, %f194, %f211;
- mov.b32 %f212, %r207;
- sub.f32 %f213, %f212, %f196;
- mov.b32 %f214, %r199;
- mul.f32 %f215, %f199, %f214;
- sub.f32 %f216, %f213, %f215;
- mul.f32 %f164, %f194, %f216;
+ mov.b32 %r215, {%rs25, %rs26};
+ mov.b32 %f215, %r211;
+ sub.f32 %f216, %f215, %f200;
+ mov.b32 %f217, %r203;
+ mul.f32 %f218, %f203, %f217;
+ sub.f32 %f219, %f216, %f218;
+ ld.shared.f32 %f220, [%rd64+320];
+ mul.f32 %f221, %f220, %f176;
+ mul.f32 %f163, %f221, %f219;
+ mov.b32 %f222, %r212;
+ sub.f32 %f223, %f222, %f200;
+ mov.b32 %f224, %r204;
+ mul.f32 %f225, %f203, %f224;
+ sub.f32 %f226, %f223, %f225;
+ ld.shared.f32 %f227, [%rd64+448];
+ mul.f32 %f228, %f227, %f176;
+ mul.f32 %f164, %f228, %f226;
{ cvt.rn.bf16.f32 %rs28, %f164;}
{ cvt.rn.bf16.f32 %rs27, %f163;}
- mov.b32 %r211, {%rs27, %rs28};
- mul.lo.s32 %r228, %r4, 896;
- add.s32 %r229, %r221, %r228;
- mad.lo.s32 %r230, %r229, %r67, %r10;
- add.s32 %r231, %r230, %r11;
- mul.wide.s32 %rd59, %r231, 2;
- add.s64 %rd44, %rd13, %rd59;
-
- st.global.cs.v2.s32 [%rd44], {%r208,%r209};
-
- add.s32 %r232, %r224, %r228;
- mad.lo.s32 %r233, %r232, %r67, %r10;
- add.s32 %r234, %r233, %r11;
- mul.wide.s32 %rd60, %r234, 2;
- add.s64 %rd45, %rd13, %rd60;
-
- st.global.cs.v2.s32 [%rd45], {%r210,%r211};
+ mov.b32 %r216, {%rs27, %rs28};
+ shl.b32 %r251, %r4, 10;
+ add.s32 %r252, %r224, %r251;
+ mad.lo.s32 %r253, %r252, %r67, %r222;
+ add.s32 %r254, %r253, %r11;
+ mul.wide.s32 %rd67, %r254, 2;
+ add.s64 %rd49, %rd15, %rd67;
+
+ st.global.cs.v2.s32 [%rd49], {%r213,%r214};
+
+ add.s32 %r255, %r252, 16;
+ mad.lo.s32 %r256, %r255, %r67, %r222;
+ add.s32 %r257, %r256, %r11;
+ mul.wide.s32 %rd68, %r257, 2;
+ add.s64 %rd50, %rd15, %rd68;
+
+ st.global.cs.v2.s32 [%rd50], {%r215,%r216};
bra.uni $L__BB0_80;
$L__BB0_7:
@%p3 bra $L__BB0_9;
ld.global.f32 %f82, [%rd8];
st.shared.f32 [%rd6+4608], %f82;
$L__BB0_9:
- mov.u32 %r14, %ctaid.x;
- add.s32 %r100, %r67, 31;
- shr.s32 %r101, %r100, 31;
- shr.u32 %r102, %r101, 27;
- add.s32 %r103, %r100, %r102;
- shr.s32 %r15, %r103, 5;
- shl.b32 %r16, %r4, 5;
- shr.s32 %r104, %r2, 31;
- shr.u32 %r105, %r104, 29;
- add.s32 %r106, %r2, %r105;
- and.b32 %r107, %r106, 1073741816;
- sub.s32 %r108, %r2, %r107;
- shl.b32 %r109, %r108, 2;
- rem.s32 %r110, %r14, %r15;
- shl.b32 %r111, %r110, 5;
- add.s32 %r20, %r111, %r109;
+ mov.u32 %r13, %ctaid.x;
+ add.s32 %r94, %r67, 31;
+ shr.s32 %r95, %r94, 31;
+ shr.u32 %r96, %r95, 27;
+ add.s32 %r97, %r94, %r96;
+ shr.s32 %r14, %r97, 5;
+ shl.b32 %r15, %r4, 5;
+ shr.s32 %r98, %r2, 31;
+ shr.u32 %r99, %r98, 29;
+ add.s32 %r100, %r2, %r99;
+ and.b32 %r101, %r100, -8;
+ sub.s32 %r16, %r2, %r101;
+ shl.b32 %r102, %r16, 2;
+ rem.s32 %r103, %r13, %r14;
+ shl.b32 %r104, %r103, 5;
+ add.s32 %r20, %r104, %r102;
or.b32 %r17, %r20, 3;
setp.ge.s32 %p8, %r17, %r67;
- shr.s32 %r18, %r106, 3;
+ shr.s32 %r18, %r100, 3;
add.s32 %r19, %r18, -216;
- mov.u32 %r235, 0;
- mov.u32 %r236, %r235;
- mov.u32 %r237, %r235;
- mov.u32 %r238, %r235;
+ mov.u32 %r258, 0;
+ mov.u32 %r259, %r258;
+ mov.u32 %r260, %r258;
+ mov.u32 %r261, %r258;
@%p8 bra $L__BB0_12;
- div.s32 %r116, %r14, %r15;
- shl.b32 %r21, %r116, 5;
- add.s32 %r117, %r19, %r21;
- neg.s32 %r118, %r16;
- setp.ge.s32 %p9, %r117, %r118;
+ div.s32 %r109, %r13, %r14;
+ shl.b32 %r21, %r109, 5;
+ add.s32 %r110, %r19, %r21;
+ neg.s32 %r111, %r15;
+ setp.ge.s32 %p9, %r110, %r111;
@%p9 bra $L__BB0_12;
- add.s32 %r123, %r16, %r18;
- add.s32 %r124, %r123, %r21;
- mad.lo.s32 %r125, %r124, %r67, %r20;
- mul.wide.s32 %rd21, %r125, 4;
- add.s64 %rd20, %rd4, %rd21;
-
- ld.global.cs.v4.u32 {%r238,%r237,%r236,%r235}, [%rd20];
+ add.s32 %r116, %r15, %r18;
+ add.s32 %r117, %r116, %r21;
+ mad.lo.s32 %r118, %r117, %r67, %r20;
+ mul.wide.s32 %rd23, %r118, 4;
+ add.s64 %rd22, %rd4, %rd23;
+
+ ld.global.cs.v4.u32 {%r261,%r260,%r259,%r258}, [%rd22];
$L__BB0_12:
- mov.f32 %f225, 0f00000000;
- mov.f32 %f226, 0f00000000;
- mov.f32 %f227, 0f00000000;
- mov.f32 %f228, 0f00000000;
+ mov.f32 %f237, 0f00000000;
+ mov.f32 %f238, 0f00000000;
+ mov.f32 %f239, 0f00000000;
+ mov.f32 %f240, 0f00000000;
@%p8 bra $L__BB0_15;
- div.s32 %r126, %r14, %r15;
- shl.b32 %r30, %r126, 5;
- add.s32 %r127, %r19, %r30;
- mov.u32 %r128, -16;
- sub.s32 %r129, %r128, %r16;
- setp.ge.s32 %p11, %r127, %r129;
+ div.s32 %r119, %r13, %r14;
+ shl.b32 %r30, %r119, 5;
+ add.s32 %r120, %r19, %r30;
+ mov.u32 %r121, -16;
+ sub.s32 %r122, %r121, %r15;
+ setp.ge.s32 %p11, %r120, %r122;
@%p11 bra $L__BB0_15;
- add.s32 %r134, %r16, %r18;
- add.s32 %r135, %r134, %r30;
- add.s32 %r136, %r135, 16;
- mad.lo.s32 %r137, %r136, %r67, %r20;
- mul.wide.s32 %rd23, %r137, 4;
- add.s64 %rd22, %rd4, %rd23;
-
- ld.global.cs.v4.u32 {%r130,%r131,%r132,%r133}, [%rd22];
-
- mov.b32 %f228, %r130;
- mov.b32 %f227, %r131;
- mov.b32 %f226, %r132;
- mov.b32 %f225, %r133;
+ add.s32 %r127, %r15, %r18;
+ add.s32 %r128, %r127, %r30;
+ add.s32 %r129, %r128, 16;
+ mad.lo.s32 %r130, %r129, %r67, %r20;
+ mul.wide.s32 %rd25, %r130, 4;
+ add.s64 %rd24, %rd4, %rd25;
+
+ ld.global.cs.v4.u32 {%r123,%r124,%r125,%r126}, [%rd24];
+
+ mov.b32 %f240, %r123;
+ mov.b32 %f239, %r124;
+ mov.b32 %f238, %r125;
+ mov.b32 %f237, %r126;
$L__BB0_15:
shl.b32 %r31, %r4, 7;
setp.gt.s32 %p12, %r2, 31;
@%p12 bra $L__BB0_18;
- div.s32 %r138, %r14, %r15;
- shl.b32 %r139, %r138, 5;
- add.s32 %r32, %r139, %r2;
+ div.s32 %r131, %r13, %r14;
+ shl.b32 %r132, %r131, 5;
+ add.s32 %r32, %r132, %r2;
setp.gt.s32 %p13, %r32, 215;
@%p13 bra $L__BB0_18;
- mul.wide.s32 %rd24, %r32, 4;
- add.s64 %rd25, %rd2, %rd24;
- ld.global.f32 %f91, [%rd25];
+ mul.wide.s32 %rd26, %r32, 4;
+ add.s64 %rd27, %rd2, %rd26;
+ ld.global.f32 %f91, [%rd27];
st.shared.f32 [%rd6+4096], %f91;
$L__BB0_18:
- mov.u32 %r239, 0;
- mov.u32 %r240, %r239;
- mov.u32 %r241, %r239;
- mov.u32 %r242, %r239;
+ mov.u32 %r262, 0;
+ mov.u32 %r263, %r262;
+ mov.u32 %r264, %r262;
+ mov.u32 %r265, %r262;
@%p8 bra $L__BB0_21;
- div.s32 %r148, %r14, %r15;
- shl.b32 %r33, %r148, 5;
- add.s32 %r149, %r19, %r33;
- neg.s32 %r150, %r31;
- setp.ge.s32 %p15, %r149, %r150;
+ div.s32 %r141, %r13, %r14;
+ shl.b32 %r33, %r141, 5;
+ add.s32 %r142, %r19, %r33;
+ neg.s32 %r143, %r31;
+ setp.ge.s32 %p15, %r142, %r143;
@%p15 bra $L__BB0_21;
- add.s32 %r155, %r31, %r18;
- add.s32 %r156, %r155, %r33;
- mad.lo.s32 %r157, %r156, %r67, %r20;
- mul.wide.s32 %rd27, %r157, 4;
- add.s64 %rd26, %rd3, %rd27;
-
- ld.global.cs.v4.u32 {%r242,%r241,%r240,%r239}, [%rd26];
+ add.s32 %r148, %r31, %r18;
+ add.s32 %r149, %r148, %r33;
+ mad.lo.s32 %r150, %r149, %r67, %r20;
+ mul.wide.s32 %rd29, %r150, 4;
+ add.s64 %rd28, %rd3, %rd29;
+
+ ld.global.cs.v4.u32 {%r265,%r264,%r263,%r262}, [%rd28];
$L__BB0_21:
setp.lt.s32 %p16, %r17, %r67;
- mov.f32 %f229, 0f00000000;
- mov.f32 %f230, 0f00000000;
- mov.f32 %f231, 0f00000000;
- mov.f32 %f232, 0f00000000;
+ mov.f32 %f241, 0f00000000;
+ mov.f32 %f242, 0f00000000;
+ mov.f32 %f243, 0f00000000;
+ mov.f32 %f244, 0f00000000;
@%p16 bra $L__BB0_22;
bra.uni $L__BB0_24;
$L__BB0_22:
- div.s32 %r158, %r14, %r15;
- shl.b32 %r42, %r158, 5;
- add.s32 %r159, %r19, %r42;
- mov.u32 %r160, -16;
- sub.s32 %r161, %r160, %r31;
- setp.ge.s32 %p17, %r159, %r161;
+ div.s32 %r151, %r13, %r14;
+ shl.b32 %r42, %r151, 5;
+ add.s32 %r152, %r19, %r42;
+ mov.u32 %r153, -16;
+ sub.s32 %r154, %r153, %r31;
+ setp.ge.s32 %p17, %r152, %r154;
@%p17 bra $L__BB0_24;
- add.s32 %r166, %r31, %r18;
- add.s32 %r167, %r166, %r42;
- add.s32 %r168, %r167, 16;
- mad.lo.s32 %r169, %r168, %r67, %r20;
- mul.wide.s32 %rd29, %r169, 4;
- add.s64 %rd28, %rd3, %rd29;
-
- ld.global.cs.v4.u32 {%r162,%r163,%r164,%r165}, [%rd28];
-
- mov.b32 %f232, %r162;
- mov.b32 %f231, %r163;
- mov.b32 %f230, %r164;
- mov.b32 %f229, %r165;
+ add.s32 %r159, %r31, %r18;
+ add.s32 %r160, %r159, %r42;
+ add.s32 %r161, %r160, 16;
+ mad.lo.s32 %r162, %r161, %r67, %r20;
+ mul.wide.s32 %rd31, %r162, 4;
+ add.s64 %rd30, %rd3, %rd31;
+
+ ld.global.cs.v4.u32 {%r155,%r156,%r157,%r158}, [%rd30];
+
+ mov.b32 %f244, %r155;
+ mov.b32 %f243, %r156;
+ mov.b32 %f242, %r157;
+ mov.b32 %f241, %r158;
$L__BB0_24:
- div.s32 %r170, %r14, %r15;
- shl.b32 %r43, %r170, 5;
- shr.u32 %r172, %r104, 27;
- add.s32 %r173, %r2, %r172;
- and.b32 %r174, %r173, -32;
- sub.s32 %r44, %r2, %r174;
- add.s32 %r175, %r43, %r44;
- setp.gt.s32 %p18, %r175, 215;
- mul.lo.s32 %r176, %r175, %r62;
- mul.wide.s32 %rd30, %r176, 4;
- add.s64 %rd9, %rd1, %rd30;
+ div.s32 %r163, %r13, %r14;
+ shl.b32 %r43, %r163, 5;
+ shr.u32 %r165, %r98, 27;
+ add.s32 %r166, %r2, %r165;
+ and.b32 %r167, %r166, -32;
+ sub.s32 %r168, %r2, %r167;
+ add.s32 %r169, %r43, %r168;
+ setp.gt.s32 %p18, %r169, 215;
+ mul.lo.s32 %r170, %r169, %r62;
+ mul.wide.s32 %rd32, %r170, 4;
+ add.s64 %rd9, %rd1, %rd32;
@%p18 bra $L__BB0_26;
- mul.wide.s32 %rd31, %r44, 4;
- add.s64 %rd33, %rd17, %rd31;
ld.global.f32 %f100, [%rd9];
- st.shared.f32 [%rd33], %f100;
+ st.shared.f32 [%rd6], %f100;
+ st.shared.f32 [%rd6+512], %f100;
+ st.shared.f32 [%rd6+1024], %f100;
+ st.shared.f32 [%rd6+1536], %f100;
+ st.shared.f32 [%rd6+2048], %f100;
+ st.shared.f32 [%rd6+2560], %f100;
+ st.shared.f32 [%rd6+3072], %f100;
+ st.shared.f32 [%rd6+3584], %f100;
$L__BB0_26:
- shl.b32 %r45, %r4, 9;
+ shl.b32 %r44, %r4, 9;
barrier.sync 0;
cvt.rn.f32.f64 %f17, %fd1;
- neg.s32 %r46, %r45;
- add.s32 %r47, %r19, %r43;
- setp.ge.s32 %p19, %r47, %r46;
- mul.wide.s32 %rd34, %r18, 4;
- add.s64 %rd10, %rd17, %rd34;
- mov.f32 %f234, 0f00000000;
- mov.f32 %f233, %f234;
+ neg.s32 %r45, %r44;
+ add.s32 %r46, %r19, %r43;
+ setp.ge.s32 %p19, %r46, %r45;
+ shr.s32 %r171, %r18, 31;
+ shr.u32 %r172, %r171, 25;
+ add.s32 %r173, %r18, %r172;
+ and.b32 %r174, %r173, -128;
+ sub.s32 %r175, %r18, %r174;
+ mul.wide.s32 %rd33, %r175, 4;
+ add.s64 %rd35, %rd19, %rd33;
+ add.s64 %rd10, %rd35, 4096;
+ mov.f32 %f246, 0f00000000;
+ mov.f32 %f245, %f246;
@%p19 bra $L__BB0_28;
- ld.shared.f32 %f233, [%rd10+4096];
+ ld.shared.f32 %f245, [%rd10];
$L__BB0_28:
- mov.b32 %f103, %r242;
- sub.f32 %f20, %f103, %f233;
+ mov.b32 %f103, %r265;
+ sub.f32 %f20, %f103, %f245;
@%p19 bra $L__BB0_30;
- ld.shared.f32 %f234, [%rd10+4608];
+ ld.shared.f32 %f246, [%rd10+512];
$L__BB0_30:
- mov.b32 %f105, %r238;
- mul.f32 %f106, %f234, %f105;
+ mov.b32 %f105, %r261;
+ mul.f32 %f106, %f246, %f105;
sub.f32 %f23, %f20, %f106;
- mov.f32 %f236, 0f00000000;
- mov.f32 %f235, %f236;
+ shl.b32 %r176, %r16, 7;
+ add.s32 %r177, %r176, %r18;
+ mul.wide.s32 %rd36, %r177, 4;
+ add.s64 %rd11, %rd19, %rd36;
+ mov.f32 %f248, 0f00000000;
+ mov.f32 %f247, %f248;
@%p19 bra $L__BB0_32;
- ld.shared.f32 %f107, [%rd10];
- mul.f32 %f235, %f107, %f17;
+ ld.shared.f32 %f107, [%rd11];
+ mul.f32 %f247, %f107, %f17;
$L__BB0_32:
- mul.f32 %f108, %f23, %f235;
+ mul.f32 %f108, %f23, %f247;
{ cvt.rn.bf16.f32 %rs13, %f108;}
@%p19 bra $L__BB0_34;
- ld.shared.f32 %f236, [%rd10+4096];
+ ld.shared.f32 %f248, [%rd10];
$L__BB0_34:
- mov.b32 %f111, %r241;
- sub.f32 %f28, %f111, %f236;
- mov.f32 %f238, 0f00000000;
- mov.f32 %f237, %f238;
+ mov.b32 %f111, %r264;
+ sub.f32 %f28, %f111, %f248;
+ mov.f32 %f250, 0f00000000;
+ mov.f32 %f249, %f250;
@%p19 bra $L__BB0_36;
- ld.shared.f32 %f237, [%rd10+4608];
+ ld.shared.f32 %f249, [%rd10+512];
$L__BB0_36:
- mov.b32 %f113, %r237;
- mul.f32 %f114, %f237, %f113;
+ mov.b32 %f113, %r260;
+ mul.f32 %f114, %f249, %f113;
sub.f32 %f31, %f28, %f114;
@%p19 bra $L__BB0_38;
- ld.shared.f32 %f115, [%rd10];
- mul.f32 %f238, %f115, %f17;
+ ld.shared.f32 %f115, [%rd11+128];
+ mul.f32 %f250, %f115, %f17;
$L__BB0_38:
- mul.f32 %f116, %f31, %f238;
+ mul.f32 %f116, %f31, %f250;
{ cvt.rn.bf16.f32 %rs14, %f116;}
- mov.f32 %f240, 0f00000000;
- mov.f32 %f239, %f240;
+ mov.f32 %f252, 0f00000000;
+ mov.f32 %f251, %f252;
@%p19 bra $L__BB0_40;
- ld.shared.f32 %f239, [%rd10+4096];
+ ld.shared.f32 %f251, [%rd10];
$L__BB0_40:
- mov.b32 %f119, %r240;
- sub.f32 %f36, %f119, %f239;
+ mov.b32 %f119, %r263;
+ sub.f32 %f36, %f119, %f251;
@%p19 bra $L__BB0_42;
- ld.shared.f32 %f240, [%rd10+4608];
+ ld.shared.f32 %f252, [%rd10+512];
$L__BB0_42:
- mov.b32 %f121, %r236;
- mul.f32 %f122, %f240, %f121;
+ mov.b32 %f121, %r259;
+ mul.f32 %f122, %f252, %f121;
sub.f32 %f39, %f36, %f122;
- mov.f32 %f242, 0f00000000;
- mov.f32 %f241, %f242;
+ mov.f32 %f254, 0f00000000;
+ mov.f32 %f253, %f254;
@%p19 bra $L__BB0_44;
- ld.shared.f32 %f123, [%rd10];
- mul.f32 %f241, %f123, %f17;
+ ld.shared.f32 %f123, [%rd11+256];
+ mul.f32 %f253, %f123, %f17;
$L__BB0_44:
- mul.f32 %f124, %f39, %f241;
+ mul.f32 %f124, %f39, %f253;
{ cvt.rn.bf16.f32 %rs15, %f124;}
@%p19 bra $L__BB0_46;
- ld.shared.f32 %f242, [%rd10+4096];
+ ld.shared.f32 %f254, [%rd10];
$L__BB0_46:
- mov.b32 %f127, %r239;
- sub.f32 %f44, %f127, %f242;
- mov.f32 %f244, 0f00000000;
- mov.f32 %f243, %f244;
+ mov.b32 %f127, %r262;
+ sub.f32 %f44, %f127, %f254;
+ mov.f32 %f256, 0f00000000;
+ mov.f32 %f255, %f256;
@%p19 bra $L__BB0_48;
- ld.shared.f32 %f243, [%rd10+4608];
+ ld.shared.f32 %f255, [%rd10+512];
$L__BB0_48:
- mov.b32 %f129, %r235;
- mul.f32 %f130, %f243, %f129;
+ mov.b32 %f129, %r258;
+ mul.f32 %f130, %f255, %f129;
sub.f32 %f47, %f44, %f130;
@%p19 bra $L__BB0_50;
- ld.shared.f32 %f131, [%rd10];
- mul.f32 %f244, %f131, %f17;
+ ld.shared.f32 %f131, [%rd11+384];
+ mul.f32 %f256, %f131, %f17;
$L__BB0_50:
- mul.f32 %f132, %f47, %f244;
+ mul.f32 %f132, %f47, %f256;
{ cvt.rn.bf16.f32 %rs16, %f132;}
- mov.u32 %r177, -16;
- sub.s32 %r48, %r177, %r45;
- setp.ge.s32 %p31, %r47, %r48;
- mov.f32 %f246, 0f00000000;
- mov.f32 %f245, %f246;
+ mov.u32 %r178, -16;
+ sub.s32 %r47, %r178, %r44;
+ setp.ge.s32 %p31, %r46, %r47;
+ add.s32 %r48, %r18, 16;
+ shr.s32 %r179, %r48, 31;
+ shr.u32 %r180, %r179, 25;
+ add.s32 %r181, %r48, %r180;
+ and.b32 %r182, %r181, -128;
+ sub.s32 %r183, %r48, %r182;
+ mul.wide.s32 %rd38, %r183, 4;
+ add.s64 %rd40, %rd19, %rd38;
+ add.s64 %rd12, %rd40, 4096;
+ mov.f32 %f258, 0f00000000;
+ mov.f32 %f257, %f258;
@%p31 bra $L__BB0_52;
- ld.shared.f32 %f245, [%rd10+4160];
+ ld.shared.f32 %f257, [%rd12];
$L__BB0_52:
- sub.f32 %f52, %f232, %f245;
+ sub.f32 %f52, %f244, %f257;
@%p31 bra $L__BB0_54;
- ld.shared.f32 %f246, [%rd10+4672];
+ ld.shared.f32 %f258, [%rd12+512];
$L__BB0_54:
- mul.f32 %f136, %f246, %f228;
+ mul.f32 %f136, %f258, %f240;
sub.f32 %f55, %f52, %f136;
- mov.f32 %f248, 0f00000000;
- mov.f32 %f247, %f248;
+ mov.f32 %f260, 0f00000000;
+ mov.f32 %f259, %f260;
@%p31 bra $L__BB0_56;
- ld.shared.f32 %f137, [%rd10+64];
- mul.f32 %f247, %f137, %f17;
+ ld.shared.f32 %f137, [%rd11+64];
+ mul.f32 %f259, %f137, %f17;
$L__BB0_56:
- mul.f32 %f138, %f55, %f247;
+ mul.f32 %f138, %f55, %f259;
{ cvt.rn.bf16.f32 %rs17, %f138;}
@%p31 bra $L__BB0_58;
- ld.shared.f32 %f248, [%rd10+4160];
+ ld.shared.f32 %f260, [%rd12];
$L__BB0_58:
- sub.f32 %f60, %f231, %f248;
- mov.f32 %f250, 0f00000000;
- mov.f32 %f249, %f250;
+ sub.f32 %f60, %f243, %f260;
+ mov.f32 %f262, 0f00000000;
+ mov.f32 %f261, %f262;
@%p31 bra $L__BB0_60;
- ld.shared.f32 %f249, [%rd10+4672];
+ ld.shared.f32 %f261, [%rd12+512];
$L__BB0_60:
- mul.f32 %f142, %f249, %f227;
+ mul.f32 %f142, %f261, %f239;
sub.f32 %f63, %f60, %f142;
@%p31 bra $L__BB0_62;
- ld.shared.f32 %f143, [%rd10+64];
- mul.f32 %f250, %f143, %f17;
+ ld.shared.f32 %f143, [%rd11+192];
+ mul.f32 %f262, %f143, %f17;
$L__BB0_62:
- mul.f32 %f144, %f63, %f250;
+ mul.f32 %f144, %f63, %f262;
{ cvt.rn.bf16.f32 %rs18, %f144;}
- mov.f32 %f252, 0f00000000;
- mov.f32 %f251, %f252;
+ mov.f32 %f264, 0f00000000;
+ mov.f32 %f263, %f264;
@%p31 bra $L__BB0_64;
- ld.shared.f32 %f251, [%rd10+4160];
+ ld.shared.f32 %f263, [%rd12];
$L__BB0_64:
- sub.f32 %f68, %f230, %f251;
+ sub.f32 %f68, %f242, %f263;
@%p31 bra $L__BB0_66;
- ld.shared.f32 %f252, [%rd10+4672];
+ ld.shared.f32 %f264, [%rd12+512];
$L__BB0_66:
- mul.f32 %f148, %f252, %f226;
+ mul.f32 %f148, %f264, %f238;
sub.f32 %f71, %f68, %f148;
- mov.f32 %f254, 0f00000000;
- mov.f32 %f253, %f254;
+ mov.f32 %f266, 0f00000000;
+ mov.f32 %f265, %f266;
@%p31 bra $L__BB0_68;
- ld.shared.f32 %f149, [%rd10+64];
- mul.f32 %f253, %f149, %f17;
+ ld.shared.f32 %f149, [%rd11+320];
+ mul.f32 %f265, %f149, %f17;
$L__BB0_68:
- mul.f32 %f150, %f71, %f253;
+ mul.f32 %f150, %f71, %f265;
{ cvt.rn.bf16.f32 %rs19, %f150;}
@%p31 bra $L__BB0_70;
- ld.shared.f32 %f254, [%rd10+4160];
+ ld.shared.f32 %f266, [%rd12];
$L__BB0_70:
- sub.f32 %f76, %f229, %f254;
- mov.f32 %f256, 0f00000000;
- mov.f32 %f255, %f256;
+ sub.f32 %f76, %f241, %f266;
+ mov.f32 %f268, 0f00000000;
+ mov.f32 %f267, %f268;
@%p31 bra $L__BB0_72;
- ld.shared.f32 %f255, [%rd10+4672];
+ ld.shared.f32 %f267, [%rd12+512];
$L__BB0_72:
- mul.f32 %f154, %f255, %f225;
+ mul.f32 %f154, %f267, %f237;
sub.f32 %f79, %f76, %f154;
@%p31 bra $L__BB0_74;
- ld.shared.f32 %f155, [%rd10+64];
- mul.f32 %f256, %f155, %f17;
+ ld.shared.f32 %f155, [%rd11+448];
+ mul.f32 %f268, %f155, %f17;
$L__BB0_74:
- mul.f32 %f156, %f79, %f256;
+ mul.f32 %f156, %f79, %f268;
{ cvt.rn.bf16.f32 %rs20, %f156;}
shl.b32 %r49, %r4, 10;
@%p8 bra $L__BB0_77;
- neg.s32 %r178, %r49;
- setp.ge.s32 %p44, %r47, %r178;
+ neg.s32 %r184, %r49;
+ setp.ge.s32 %p44, %r46, %r184;
@%p44 bra $L__BB0_77;
- add.s32 %r181, %r49, %r18;
- add.s32 %r182, %r181, %r43;
- mad.lo.s32 %r183, %r182, %r67, %r20;
- mul.wide.s32 %rd37, %r183, 2;
- add.s64 %rd36, %rd13, %rd37;
- mov.b32 %r180, {%rs15, %rs16};
- mov.b32 %r179, {%rs13, %rs14};
-
- st.global.cs.v2.s32 [%rd36], {%r179,%r180};
+ add.s32 %r187, %r49, %r18;
+ add.s32 %r188, %r187, %r43;
+ mad.lo.s32 %r189, %r188, %r67, %r20;
+ mul.wide.s32 %rd42, %r189, 2;
+ add.s64 %rd41, %rd15, %rd42;
+ mov.b32 %r186, {%rs15, %rs16};
+ mov.b32 %r185, {%rs13, %rs14};
+
+ st.global.cs.v2.s32 [%rd41], {%r185,%r186};
$L__BB0_77:
- mov.u32 %r184, -16;
- sub.s32 %r185, %r184, %r49;
- setp.ge.s32 %p46, %r47, %r185;
+ mov.u32 %r190, -16;
+ sub.s32 %r191, %r190, %r49;
+ setp.ge.s32 %p46, %r46, %r191;
or.pred %p47, %p8, %p46;
@%p47 bra $L__BB0_80;
- add.s32 %r188, %r49, %r18;
- add.s32 %r189, %r188, %r43;
- add.s32 %r190, %r189, 16;
- mad.lo.s32 %r191, %r190, %r67, %r20;
- mul.wide.s32 %rd39, %r191, 2;
- add.s64 %rd38, %rd13, %rd39;
- mov.b32 %r187, {%rs19, %rs20};
- mov.b32 %r186, {%rs17, %rs18};
-
- st.global.cs.v2.s32 [%rd38], {%r186,%r187};
+ add.s32 %r194, %r48, %r49;
+ add.s32 %r195, %r194, %r43;
+ mad.lo.s32 %r196, %r195, %r67, %r20;
+ mul.wide.s32 %rd44, %r196, 2;
+ add.s64 %rd43, %rd15, %rd44;
+ mov.b32 %r193, {%rs19, %rs20};
+ mov.b32 %r192, {%rs17, %rs18};
+
+ st.global.cs.v2.s32 [%rd43], {%r192,%r193};
$L__BB0_80:
ret;
25: CombinedSchedulerTest.SharedProducer
Kernel 2
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-16
+16 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T23, Tensor<float, 1, 1> T22, Tensor<float, 2, 2> T40, Tensor<float, 2, 2> T24, Tensor<float, 2, 2> T54, Tensor<float, 2, 2> T59, Tensor<int64_t, 1, 1> T64) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T37 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T36 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T54
// Allocate global tensor T59
__syncthreads();
Array<float, 4, 4> T55;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T55[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T60;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T60[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T53;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T58;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T58[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T36) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
+ T43[i8];
}
} else {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
+ T43[i8];
}
}
Array<float, 1, 1> T34;
T34[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T34[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T34[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T33) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T35[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T52;
T52[0] = 0.000000000e+00f;
Array<float, 1, 1> T63;
T63[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T47[i9]
- T34[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T35[0];
Array<float, 1, 1> T21;
T21[0]
= T44[i9]
* T7[0];
T58[i9]
= T58[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T51[i9];
Array<float, 1, 1> T9;
T9[0]
= T46[i9]
* T8[0];
T52[0]
= T52[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T63[0]
= T63[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T47[i9]
- T34[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T35[0];
Array<float, 1, 1> T21;
T21[0]
= T44[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T58[i9]
= T58[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T51[i9];
Array<float, 1, 1> T9;
T9[0]
= T46[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T52[0]
= T52[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T63[0]
= T63[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T52[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T63[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T30;
T30[0]
= T50[i11];
Array<float, 1, 1> T31;
T31[0]
= T45[i11]
* T30[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T31[0];
Array<float, 1, 1> T27;
T27[0]
= T48[i11]
- T34[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T35[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T41[i11]
= T20[0];
T42[i11]
= T20[0]
+ T49[i11];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T40[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T41[0]);
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T42[0]);
} else {
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T30;
T30[0]
= T50[i11];
Array<float, 1, 1> T31;
T31[0]
= T45[i11]
* T30[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T31[0];
Array<float, 1, 1> T27;
T27[0]
= T48[i11]
- T34[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T35[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T41[i11]
= T20[0];
T42[i11]
= T20[0]
+ T49[i11];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T40[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T41[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T42[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T55[i6], T53[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T60[i7], T58[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T54[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T55[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T59[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T60[0]);
}
}
// Allocate global tensor T64
grid_sync::sync<false, true, false, true, true>(T64[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T62;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T62[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T61;
T61.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T61[0], &*(volatile float*)&T59[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T62[i12]
= T62[i12]
+ T61[i12];
}
}
Array<float, 2, 2> T39;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T39[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T39[i14], T62[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T39[0]);
}
Array<float, 2, 1> T57;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T57[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T56;
T56.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T56[0], &*(volatile float*)&T54[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T57[i15]
= T57[i15]
+ T56[i15];
}
}
Array<float, 2, 2> T38;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T38[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T38[i17], T57[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T38[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T23, Tensor<float, 1, 1> T22, Tensor<float, 2, 2> T40, Tensor<float, 2, 2> T24, Tensor<float, 2, 2> T54, Tensor<float, 2, 2> T59, Tensor<int64_t, 1, 1> T64) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T37 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T36 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T33 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T54
// Allocate global tensor T59
__syncthreads();
Array<float, 4, 4> T55;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T55[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T60;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T60[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T53;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T58;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T58[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T36) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
+ T43[i8];
}
} else {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
+ T43[i8];
}
}
Array<float, 1, 1> T34;
T34[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T34[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T34[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T33) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T35[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T52;
T52[0] = 0.000000000e+00f;
Array<float, 1, 1> T63;
T63[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T44;
T44.set(float(0));
loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T47[i9]
- T34[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T35[0];
Array<float, 1, 1> T21;
T21[0]
= T44[i9]
* T7[0];
T58[i9]
= T58[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T51[i9];
Array<float, 1, 1> T9;
T9[0]
= T46[i9]
* T8[0];
T52[0]
= T52[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T63[0]
= T63[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T47[i9]
- T34[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T35[0];
Array<float, 1, 1> T21;
T21[0]
= T44[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T58[i9]
= T58[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T51[i9];
Array<float, 1, 1> T9;
T9[0]
= T46[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T52[0]
= T52[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T63[0]
= T63[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T52[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T63[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T45;
T45.set(float(0));
loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T30;
T30[0]
= T50[i11];
Array<float, 1, 1> T31;
T31[0]
= T45[i11]
* T30[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T31[0];
Array<float, 1, 1> T27;
T27[0]
= T48[i11]
- T34[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T35[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T41[i11]
= T20[0];
T42[i11]
= T20[0]
+ T49[i11];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T40[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T41[0]);
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T42[0]);
} else {
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T30;
T30[0]
= T50[i11];
Array<float, 1, 1> T31;
T31[0]
= T45[i11]
* T30[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T31[0];
Array<float, 1, 1> T27;
T27[0]
= T48[i11]
- T34[0];
Array<float, 1, 1> T28;
T28[0]
= T27[0]
* T35[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T28[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T41[i11]
= T20[0];
T42[i11]
= T20[0]
+ T49[i11];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T40[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T41[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T42[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T55[i6], T53[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T60[i7], T58[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T54[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T55[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T59[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T60[0]);
}
}
// Allocate global tensor T64
grid_sync::sync<false, true, false, true, true>(T64[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T62;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T62[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T61;
T61.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T61[0], &*(volatile float*)&T59[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T62[i12]
= T62[i12]
+ T61[i12];
}
}
Array<float, 2, 2> T39;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T39[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T39[i14], T62[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T39[0]);
}
Array<float, 2, 1> T57;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T57[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T56;
T56.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T56[0], &*(volatile float*)&T54[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T57[i15]
= T57[i15]
+ T56[i15];
}
}
Array<float, 2, 2> T38;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
T38[i17] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
blockReduce<true, false, false, true>(T38[i17], T57[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T38[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T36) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T36) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
+ T43[i8];
}
} else {
Array<float, 4, 4> T43;
T43.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T43[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T53[i8]
= T53[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T33) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T33) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -152,17 +152,17 @@
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
- loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T47;
T47.set(float(0));
- loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T44;
T44.set(float(0));
- loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T47[i9]
@@ -203,21 +203,21 @@
loadGeneric<float, 4>( &T51[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T44;
T44.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T44[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
@@ -266,20 +266,20 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0));
- loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T48;
T48.set(float(0));
- loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T45;
T45.set(float(0));
- loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
#pragma unroll
@@ -329,26 +329,26 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T24[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T42[0]);
} else {
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T33[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T37[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T45;
T45.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T36[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T41 = T45;
// Alias Allocation - register
auto& T42 = T48;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<394>;
.reg .b32 %r<471>;
.reg .f64 %fd<3>;
.reg .b64 %rd<159>;
ld.param.v2.u32 {%r170, %r171}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r176, %r177}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r204, %r177, 3;
shr.s32 %r205, %r204, 31;
shr.u32 %r206, %r205, 30;
add.s32 %r207, %r204, %r206;
shr.s32 %r208, %r207, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r209, %r208, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r210, %r3, 2;
mad.lo.s32 %r211, %r210, %r209, 15;
and.b32 %r212, %r211, -16;
cvt.u64.u32 %rd1, %r212;
mul.lo.s32 %r213, %r3, %r208;
shl.b32 %r214, %r213, 4;
or.b32 %r215, %r214, 15;
and.b32 %r4, %r215, -16;
add.s32 %r216, %r215, %r4;
and.b32 %r217, %r216, -16;
cvt.s64.s32 %rd2, %r217;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p9, %r5, %r208;
shl.b32 %r6, %r5, 2;
or.b32 %r218, %r6, 3;
setp.lt.s32 %p10, %r218, %r177;
and.pred %p1, %p10, %p9;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p11, %r7, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r219, smem_ptr; }
// end inline asm
shl.b32 %r222, %r5, 4;
add.s32 %r220, %r219, %r222;
mul.wide.s32 %rd49, %r6, 4;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r221, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r221, 0;
cp.async.ca.shared.global [%r220], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r223, %r3, 63;
div.s32 %r224, %r223, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r225, %r8, %r224;
add.s32 %r226, %r225, -1;
div.s32 %r9, %r226, %r8;
setp.gt.s32 %p13, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r177;
cvt.s64.s32 %rd50, %r4;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r228, %ctaid.y;
mul.lo.s32 %r229, %r9, %r3;
mul.lo.s32 %r10, %r229, %r228;
shl.b32 %r230, %r7, 2;
shl.b32 %r231, %r5, 4;
mad.lo.s32 %r11, %r230, %r177, %r231;
mul.lo.s32 %r232, %r177, %r7;
cvt.s64.s32 %rd54, %r232;
cvt.s64.s32 %rd55, %r6;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r233, %r10, %r177;
cvt.s64.s32 %rd6, %r233;
mul.lo.s32 %r12, %r177, %r3;
mul.lo.s32 %r13, %r9, %r228;
add.s32 %r14, %r232, %r6;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r14, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r234, %tid.z;
mad.lo.s32 %r235, %r3, %r234, %r7;
mad.lo.s32 %r15, %r235, %r2, %r5;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r236, %r2;
mov.u32 %r237, 31;
sub.s32 %r238, %r237, %r236;
mov.u32 %r239, 1;
shl.b32 %r16, %r239, %r238;
setp.lt.u32 %p14, %r5, %r16;
add.s32 %r240, %r16, %r5;
setp.lt.u32 %p15, %r240, %r2;
and.pred %p3, %p14, %p15;
add.s32 %r241, %r15, %r16;
mul.wide.s32 %rd59, %r241, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r242, %r16, 31;
add.s32 %r243, %r16, %r242;
shr.s32 %r17, %r243, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r6, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r244, %r15, 1;
mul.wide.u32 %rd62, %r244, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r235, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd34;
cvta.to.global.u64 %rd16, %rd35;
add.s64 %rd20, %rd46, %rd51;
mov.u32 %r429, 0;
mov.f32 %f358, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd20; cvt.u32.u64 %r247, smem_ptr; }
// end inline asm
add.s32 %r248, %r11, %r247;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r273, smem_ptr; }
// end inline asm
add.s32 %r274, %r11, %r273;
not.pred %p26, %p3;
mov.f32 %f359, %f358;
mov.f32 %f360, %f358;
mov.f32 %f361, %f358;
mov.f32 %f370, %f358;
mov.f32 %f371, %f358;
mov.f32 %f372, %f358;
mov.f32 %f373, %f358;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r245, %r429, %r3, %r7;
add.s32 %r246, %r245, %r10;
setp.gt.s32 %p17, %r246, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r250, %r12, %r429;
cvt.s64.s32 %rd67, %r250;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd36, %rd70;
mov.u32 %r249, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r249, 0;
cp.async.ca.shared.global [%r248], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r251, %r13, %r429;
mad.lo.s32 %r252, %r251, %r3, %r7;
setp.lt.s32 %p19, %r252, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r430, %r431, %r432, %r433}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r261, %r13, %r429;
mad.lo.s32 %r262, %r261, %r3, %r7;
setp.gt.s32 %p20, %r262, 63;
mov.u32 %r430, 0;
mov.u32 %r431, %r430;
mov.u32 %r432, %r430;
mov.u32 %r433, %r430;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r430, %r431, %r432, %r433}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r430, 0;
mov.u32 %r431, %r430;
mov.u32 %r432, %r430;
mov.u32 %r433, %r430;
$L__BB0_15:
add.s32 %r271, %r13, %r429;
mad.lo.s32 %r33, %r271, %r3, %r7;
mov.b32 %f112, %r433;
add.f32 %f373, %f373, %f112;
mov.b32 %f113, %r432;
add.f32 %f372, %f372, %f113;
mov.b32 %f114, %r431;
add.f32 %f371, %f371, %f114;
mov.b32 %f115, %r430;
add.f32 %f370, %f370, %f115;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f356, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r272, %r33, %r170;
mul.wide.s32 %rd71, %r272, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f356, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r276, %r12, %r429;
cvt.s64.s32 %rd75, %r276;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd33, %rd78;
mov.u32 %r275, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r275, 0;
cp.async.ca.shared.global [%r274], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r428, %r13, %r429;
mad.lo.s32 %r427, %r428, %r3, %r7;
setp.gt.s32 %p142, %r427, 63;
mov.f32 %f362, 0f00000000;
mov.f32 %f357, %f362;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r277, %r33, %r174;
mul.wide.s32 %rd79, %r277, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f357, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f363, %f362;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
sub.f32 %f124, %f119, %f356;
mul.f32 %f125, %f357, %f124;
ld.shared.v4.f32 {%f126, %f127, %f128, %f129}, [%rd7];
fma.rn.f32 %f358, %f125, %f126, %f358;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
mul.f32 %f136, %f131, %f126;
add.f32 %f137, %f136, 0f00000000;
fma.rn.f32 %f138, %f125, %f136, 0f00000000;
sub.f32 %f140, %f120, %f356;
mul.f32 %f141, %f357, %f140;
fma.rn.f32 %f359, %f141, %f127, %f359;
mul.f32 %f144, %f132, %f127;
add.f32 %f145, %f137, %f144;
fma.rn.f32 %f146, %f141, %f144, %f138;
sub.f32 %f148, %f121, %f356;
mul.f32 %f149, %f357, %f148;
fma.rn.f32 %f360, %f149, %f128, %f360;
mul.f32 %f152, %f133, %f128;
add.f32 %f153, %f145, %f152;
fma.rn.f32 %f154, %f149, %f152, %f146;
sub.f32 %f156, %f122, %f356;
mul.f32 %f157, %f357, %f156;
fma.rn.f32 %f361, %f157, %f129, %f361;
mul.f32 %f160, %f134, %f129;
add.f32 %f363, %f153, %f160;
fma.rn.f32 %f362, %f157, %f160, %f154;
$L__BB0_23:
st.shared.f32 [%rd8], %f363;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r434, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r5, %r434;
@%p28 bra $L__BB0_29;
add.s32 %r278, %r434, %r15;
mul.wide.s32 %rd81, %r278, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r434, 1;
setp.gt.u32 %p29, %r434, 3;
mov.u32 %r434, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r5, 0;
mov.f32 %f364, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r2, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f364, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f364, %f364, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f362;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r435, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r5, %r435;
@%p34 bra $L__BB0_39;
add.s32 %r279, %r435, %r15;
mul.wide.s32 %rd84, %r279, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r435, 1;
setp.gt.u32 %p35, %r435, 3;
mov.u32 %r435, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f365, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r2, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f365, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f365, %f365, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f364;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f365;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f357, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd10];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd12];
sub.f32 %f197, %f192, %f356;
mul.f32 %f198, %f357, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r280, %f202;
add.f32 %f203, %f202, %f192;
mov.b32 %r284, %f203;
mul.f32 %f206, %f181, %f186;
mul.f32 %f207, %f206, %f2;
sub.f32 %f209, %f193, %f356;
mul.f32 %f210, %f357, %f209;
sub.f32 %f211, %f207, %f37;
mul.f32 %f212, %f38, %f210;
sub.f32 %f213, %f211, %f212;
mul.f32 %f214, %f179, %f213;
mov.b32 %r281, %f214;
add.f32 %f215, %f214, %f193;
mov.b32 %r285, %f215;
mul.f32 %f218, %f182, %f187;
mul.f32 %f219, %f218, %f2;
sub.f32 %f221, %f194, %f356;
mul.f32 %f222, %f357, %f221;
sub.f32 %f223, %f219, %f37;
mul.f32 %f224, %f38, %f222;
sub.f32 %f225, %f223, %f224;
mul.f32 %f226, %f179, %f225;
mov.b32 %r282, %f226;
add.f32 %f227, %f226, %f194;
mov.b32 %r286, %f227;
mul.f32 %f230, %f183, %f188;
mul.f32 %f231, %f230, %f2;
sub.f32 %f233, %f195, %f356;
mul.f32 %f234, %f357, %f233;
sub.f32 %f235, %f231, %f37;
mul.f32 %f236, %f38, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f179, %f237;
mov.b32 %r283, %f238;
add.f32 %f239, %f238, %f195;
mov.b32 %r287, %f239;
mad.lo.s32 %r288, %r429, %r3, %r10;
mad.lo.s32 %r289, %r288, %r177, %r14;
mul.wide.s32 %rd89, %r289, 4;
add.s64 %rd87, %rd40, %rd89;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r280,%r281,%r282,%r283};
// end inline asm
add.s64 %rd88, %rd41, %rd89;
// begin inline asm
st.global.cs.v4.s32 [%rd88], {%r284,%r285,%r286,%r287};
// end inline asm
$L__BB0_49:
add.s32 %r429, %r429, 1;
setp.lt.s32 %p41, %r429, %r9;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f358, 0f00000000;
mov.f32 %f359, %f358;
mov.f32 %f360, %f358;
mov.f32 %f361, %f358;
mov.f32 %f370, %f358;
mov.f32 %f371, %f358;
mov.f32 %f372, %f358;
mov.f32 %f373, %f358;
$L__BB0_50:
mov.u32 %r290, %tid.z;
mad.lo.s32 %r291, %r3, %r290, %r7;
mad.lo.s32 %r39, %r291, %r2, %r5;
mul.wide.u32 %rd90, %r39, 4;
add.s64 %rd24, %rd45, %rd90;
clz.b32 %r292, %r3;
mov.u32 %r293, 31;
sub.s32 %r294, %r293, %r292;
mov.u32 %r295, 1;
shl.b32 %r40, %r295, %r294;
setp.lt.u32 %p42, %r7, %r40;
add.s32 %r296, %r40, %r7;
setp.lt.u32 %p43, %r296, %r3;
and.pred %p5, %p42, %p43;
shl.b32 %r297, %r2, %r294;
add.s32 %r298, %r39, %r297;
mul.wide.s32 %rd92, %r298, 4;
add.s64 %rd25, %rd45, %rd92;
shr.u32 %r299, %r40, 31;
add.s32 %r300, %r40, %r299;
shr.s32 %r450, %r300, 1;
st.shared.f32 [%rd24], %f370;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f240, [%rd25];
ld.shared.f32 %f241, [%rd24];
add.f32 %f242, %f240, %f241;
st.shared.f32 [%rd24], %f242;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r436, %r450;
$L__BB0_54:
setp.ge.u32 %p46, %r7, %r436;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r301, %r436, %r2, %r39;
mul.wide.s32 %rd93, %r301, 4;
add.s64 %rd95, %rd45, %rd93;
ld.shared.f32 %f243, [%rd24];
ld.shared.f32 %f244, [%rd95];
add.f32 %f245, %f244, %f243;
st.shared.f32 [%rd24], %f245;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r436, 1;
setp.gt.u32 %p47, %r436, 3;
mov.u32 %r436, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r303, %r39, %r2;
mul.wide.u32 %rd96, %r303, 4;
add.s64 %rd26, %rd45, %rd96;
setp.ne.s32 %p48, %r7, 0;
mov.u32 %r437, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r3, 2;
ld.shared.f32 %f246, [%rd24];
add.f32 %f374, %f246, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f247, [%rd26];
add.f32 %f374, %f374, %f247;
$L__BB0_60:
mov.b32 %r437, %f374;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd24], %f371;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f248, [%rd25];
ld.shared.f32 %f249, [%rd24];
add.f32 %f250, %f248, %f249;
st.shared.f32 [%rd24], %f250;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r438, %r450;
$L__BB0_65:
setp.ge.u32 %p52, %r7, %r438;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r304, %r438, %r2, %r39;
mul.wide.s32 %rd98, %r304, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f251, [%rd24];
ld.shared.f32 %f252, [%rd100];
add.f32 %f253, %f252, %f251;
st.shared.f32 [%rd24], %f253;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r438, 1;
setp.gt.u32 %p53, %r438, 3;
mov.u32 %r438, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r439, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r3, 2;
ld.shared.f32 %f254, [%rd24];
add.f32 %f375, %f254, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f255, [%rd26];
add.f32 %f375, %f375, %f255;
$L__BB0_71:
mov.b32 %r439, %f375;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd24], %f372;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f256, [%rd25];
ld.shared.f32 %f257, [%rd24];
add.f32 %f258, %f256, %f257;
st.shared.f32 [%rd24], %f258;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r440, %r450;
$L__BB0_76:
setp.ge.u32 %p58, %r7, %r440;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r306, %r440, %r2, %r39;
mul.wide.s32 %rd101, %r306, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f259, [%rd24];
ld.shared.f32 %f260, [%rd103];
add.f32 %f261, %f260, %f259;
st.shared.f32 [%rd24], %f261;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r440, 1;
setp.gt.u32 %p59, %r440, 3;
mov.u32 %r440, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r441, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r3, 2;
ld.shared.f32 %f262, [%rd24];
add.f32 %f376, %f262, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f263, [%rd26];
add.f32 %f376, %f376, %f263;
$L__BB0_82:
mov.b32 %r441, %f376;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd24], %f373;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f264, [%rd25];
ld.shared.f32 %f265, [%rd24];
add.f32 %f266, %f264, %f265;
st.shared.f32 [%rd24], %f266;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r442, %r450;
$L__BB0_87:
setp.ge.u32 %p64, %r7, %r442;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r308, %r442, %r2, %r39;
mul.wide.s32 %rd104, %r308, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f267, [%rd24];
ld.shared.f32 %f268, [%rd106];
add.f32 %f269, %f268, %f267;
st.shared.f32 [%rd24], %f269;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r442, 1;
setp.gt.u32 %p65, %r442, 3;
mov.u32 %r442, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r443, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r3, 2;
ld.shared.f32 %f270, [%rd24];
add.f32 %f377, %f270, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f271, [%rd26];
add.f32 %f377, %f377, %f271;
$L__BB0_93:
mov.b32 %r443, %f377;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd24], %f358;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f272, [%rd25];
ld.shared.f32 %f273, [%rd24];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd24], %f274;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r444, %r450;
$L__BB0_98:
setp.ge.u32 %p70, %r7, %r444;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r310, %r444, %r2, %r39;
mul.wide.s32 %rd107, %r310, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f275, [%rd24];
ld.shared.f32 %f276, [%rd109];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd24], %f277;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r444, 1;
setp.gt.u32 %p71, %r444, 3;
mov.u32 %r444, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r445, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r3, 2;
ld.shared.f32 %f278, [%rd24];
add.f32 %f378, %f278, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f279, [%rd26];
add.f32 %f378, %f378, %f279;
$L__BB0_104:
mov.b32 %r445, %f378;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd24], %f359;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f280, [%rd25];
ld.shared.f32 %f281, [%rd24];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd24], %f282;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r446, %r450;
$L__BB0_109:
setp.ge.u32 %p76, %r7, %r446;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r312, %r446, %r2, %r39;
mul.wide.s32 %rd110, %r312, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f283, [%rd24];
ld.shared.f32 %f284, [%rd112];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd24], %f285;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r446, 1;
setp.gt.u32 %p77, %r446, 3;
mov.u32 %r446, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r447, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r3, 2;
ld.shared.f32 %f286, [%rd24];
add.f32 %f379, %f286, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f287, [%rd26];
add.f32 %f379, %f379, %f287;
$L__BB0_115:
mov.b32 %r447, %f379;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd24], %f360;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f288, [%rd25];
ld.shared.f32 %f289, [%rd24];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd24], %f290;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r448, %r450;
$L__BB0_120:
setp.ge.u32 %p82, %r7, %r448;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r314, %r448, %r2, %r39;
mul.wide.s32 %rd113, %r314, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f291, [%rd24];
ld.shared.f32 %f292, [%rd115];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd24], %f293;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r448, 1;
setp.gt.u32 %p83, %r448, 3;
mov.u32 %r448, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r449, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f32 %f294, [%rd24];
add.f32 %f380, %f294, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f295, [%rd26];
add.f32 %f380, %f380, %f295;
$L__BB0_126:
mov.b32 %r449, %f380;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd24], %f361;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f296, [%rd25];
ld.shared.f32 %f297, [%rd24];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd24], %f298;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r7, %r450;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r316, %r450, %r2, %r39;
mul.wide.s32 %rd116, %r316, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f299, [%rd24];
ld.shared.f32 %f300, [%rd118];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd24], %f301;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r450, 1;
setp.gt.u32 %p89, %r450, 3;
mov.u32 %r450, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r451, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f32 %f302, [%rd24];
add.f32 %f381, %f302, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f303, [%rd26];
add.f32 %f381, %f381, %f303;
$L__BB0_136:
mov.b32 %r451, %f381;
$L__BB0_137:
setp.eq.s32 %p141, %r7, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r426, %r5, 2;
mov.u32 %r326, %ctaid.y;
mad.lo.s32 %r327, %r177, %r326, %r426;
mul.wide.s32 %rd121, %r327, 4;
add.s64 %rd119, %rd42, %rd121;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r437,%r439,%r441,%r443};
// end inline asm
add.s64 %rd120, %rd43, %rd121;
// begin inline asm
st.volatile.global.v4.s32 [%rd120], {%r445,%r447,%r449,%r451};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r328, %r5, %r7;
or.b32 %r330, %r328, %r290;
setp.ne.s32 %p92, %r330, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd158, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd122, %rd158;
mov.u32 %r331, %ctaid.x;
mov.u32 %r332, %ctaid.z;
mov.u32 %r333, %nctaid.x;
mad.lo.s32 %r334, %r332, %r333, %r331;
mul.wide.s32 %rd123, %r334, 8;
add.s64 %rd29, %rd122, %rd123;
add.s32 %r335, %r8, -1;
setp.eq.s32 %p93, %r74, %r335;
cvt.s64.s32 %rd124, %r8;
mov.u64 %rd125, -9223372036854775807;
sub.s64 %rd126, %rd125, %rd124;
selp.b64 %rd127, %rd126, 1, %p93;
atom.global.add.u64 %rd30, [%rd29], %rd127;
ld.volatile.global.u64 %rd128, [%rd29];
xor.b64 %rd129, %rd128, %rd30;
setp.lt.s64 %p94, %rd129, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r452, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r452;
// end inline asm
setp.lt.u32 %p95, %r452, 256;
selp.u32 %r338, 1, 0, %p95;
shl.b32 %r452, %r452, %r338;
ld.volatile.global.u64 %rd130, [%rd29];
xor.b64 %rd131, %rd130, %rd30;
setp.gt.s64 %p96, %rd131, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r339, %r8, %r2;
add.s32 %r340, %r339, -1;
div.s32 %r77, %r340, %r2;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f384, 0f00000000;
mov.f32 %f385, %f384;
@%p97 bra $L__BB0_149;
add.s32 %r342, %r177, 1;
shr.u32 %r343, %r342, 31;
add.s32 %r344, %r342, %r343;
shr.s32 %r345, %r344, 1;
add.s32 %r346, %r3, %r345;
add.s32 %r347, %r346, -1;
shl.b32 %r348, %r7, 1;
shl.b32 %r349, %r3, 1;
mad.lo.s32 %r350, %r349, %r74, %r348;
or.b32 %r351, %r350, 1;
setp.ge.s32 %p98, %r351, %r177;
div.s32 %r352, %r347, %r3;
setp.ge.s32 %p99, %r74, %r352;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r353, %r3, %r74;
shl.b32 %r354, %r353, 1;
mad.lo.s32 %r355, %r177, %r5, %r354;
add.s32 %r454, %r355, %r348;
mul.lo.s32 %r79, %r177, %r2;
mov.u32 %r341, 0;
mov.f32 %f384, 0f00000000;
mov.u32 %r453, %r5;
mov.u32 %r455, %r341;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r456, %r341;
mov.u32 %r457, %r341;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r453, %r8;
mov.u32 %r456, %r341;
mov.u32 %r457, %r341;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd133, %r454, 4;
add.s64 %rd132, %rd43, %rd133;
// begin inline asm
ld.volatile.global.v2.s32 {%r457,%r456}, [%rd132];
// end inline asm
$L__BB0_148:
mov.b32 %f308, %r457;
add.f32 %f384, %f384, %f308;
mov.b32 %f309, %r456;
add.f32 %f385, %f385, %f309;
add.s32 %r454, %r454, %r79;
add.s32 %r453, %r453, %r2;
add.s32 %r455, %r455, 1;
setp.lt.s32 %p101, %r455, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r362, %r2;
mov.u32 %r363, 31;
sub.s32 %r364, %r363, %r362;
mov.u32 %r365, 1;
shl.b32 %r90, %r365, %r364;
setp.lt.u32 %p102, %r5, %r90;
add.s32 %r366, %r90, %r5;
setp.lt.u32 %p103, %r366, %r2;
and.pred %p7, %p102, %p103;
add.s32 %r367, %r39, %r90;
mul.wide.s32 %rd134, %r367, 4;
add.s64 %rd31, %rd45, %rd134;
shr.u32 %r368, %r90, 31;
add.s32 %r369, %r90, %r368;
shr.s32 %r469, %r369, 1;
st.shared.f32 [%rd24], %f384;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f310, [%rd31];
ld.shared.f32 %f311, [%rd24];
add.f32 %f312, %f310, %f311;
st.shared.f32 [%rd24], %f312;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r458, %r469;
$L__BB0_153:
setp.ge.u32 %p106, %r5, %r458;
@%p106 bra $L__BB0_155;
add.s32 %r370, %r458, %r39;
mul.wide.s32 %rd136, %r370, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f313, [%rd24];
ld.shared.f32 %f314, [%rd138];
add.f32 %f315, %f314, %f313;
st.shared.f32 [%rd24], %f315;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r458, 1;
setp.gt.u32 %p107, %r458, 3;
mov.u32 %r458, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r372, %r39, 1;
mul.wide.u32 %rd139, %r372, 4;
add.s64 %rd32, %rd45, %rd139;
setp.ne.s32 %p108, %r5, 0;
mov.u32 %r459, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r2, 2;
ld.shared.f32 %f316, [%rd24];
add.f32 %f386, %f316, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f317, [%rd32];
add.f32 %f386, %f386, %f317;
$L__BB0_159:
mov.b32 %r459, %f386;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd24], %f385;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f318, [%rd31];
ld.shared.f32 %f319, [%rd24];
add.f32 %f320, %f318, %f319;
st.shared.f32 [%rd24], %f320;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r460, %r469;
$L__BB0_164:
setp.ge.u32 %p112, %r5, %r460;
@%p112 bra $L__BB0_166;
add.s32 %r373, %r460, %r39;
mul.wide.s32 %rd141, %r373, 4;
add.s64 %rd143, %rd45, %rd141;
ld.shared.f32 %f321, [%rd24];
ld.shared.f32 %f322, [%rd143];
add.f32 %f323, %f322, %f321;
st.shared.f32 [%rd24], %f323;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r460, 1;
setp.gt.u32 %p113, %r460, 3;
mov.u32 %r460, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r461, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r2, 2;
ld.shared.f32 %f324, [%rd24];
add.f32 %f387, %f324, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f325, [%rd32];
add.f32 %f387, %f387, %f325;
$L__BB0_170:
mov.b32 %r461, %f387;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r375, %r177, 1;
shr.u32 %r376, %r375, 31;
add.s32 %r377, %r375, %r376;
shr.s32 %r378, %r377, 1;
add.s32 %r379, %r3, %r378;
add.s32 %r380, %r379, -1;
div.s32 %r381, %r380, %r3;
setp.ge.s32 %p117, %r74, %r381;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r7, 1;
mul.lo.s32 %r382, %r3, %r74;
shl.b32 %r101, %r382, 1;
add.s32 %r383, %r100, %r101;
or.b32 %r384, %r383, 1;
setp.ge.s32 %p118, %r384, %r177;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd157, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r387, %r101, %r100;
mul.wide.s32 %rd145, %r387, 4;
add.s64 %rd144, %rd157, %rd145;
// begin inline asm
st.global.cs.v2.s32 [%rd144], {%r459,%r461};
// end inline asm
$L__BB0_175:
mov.f32 %f390, 0f00000000;
mov.f32 %f391, %f390;
@%p97 bra $L__BB0_181;
add.s32 %r389, %r177, 1;
shr.u32 %r390, %r389, 31;
add.s32 %r391, %r389, %r390;
shr.s32 %r392, %r391, 1;
add.s32 %r393, %r3, %r392;
add.s32 %r394, %r393, -1;
shl.b32 %r395, %r7, 1;
shl.b32 %r396, %r3, 1;
mad.lo.s32 %r397, %r396, %r74, %r395;
or.b32 %r398, %r397, 1;
setp.ge.s32 %p120, %r398, %r177;
div.s32 %r399, %r394, %r3;
setp.ge.s32 %p121, %r74, %r399;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r400, %r3, %r74;
shl.b32 %r401, %r400, 1;
mad.lo.s32 %r402, %r177, %r5, %r401;
add.s32 %r463, %r402, %r395;
mul.lo.s32 %r103, %r177, %r2;
mov.u32 %r388, 0;
mov.f32 %f390, 0f00000000;
mov.u32 %r462, %r5;
mov.u32 %r464, %r388;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r465, %r388;
mov.u32 %r466, %r388;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r462, %r8;
mov.u32 %r465, %r388;
mov.u32 %r466, %r388;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd147, %r463, 4;
add.s64 %rd146, %rd42, %rd147;
// begin inline asm
ld.volatile.global.v2.s32 {%r466,%r465}, [%rd146];
// end inline asm
$L__BB0_180:
mov.b32 %f330, %r466;
add.f32 %f390, %f390, %f330;
mov.b32 %f331, %r465;
add.f32 %f391, %f391, %f331;
add.s32 %r463, %r463, %r103;
add.s32 %r462, %r462, %r2;
add.s32 %r464, %r464, 1;
setp.lt.s32 %p123, %r464, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd24], %f390;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f332, [%rd31];
ld.shared.f32 %f333, [%rd24];
add.f32 %f334, %f332, %f333;
st.shared.f32 [%rd24], %f334;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r467, %r469;
$L__BB0_185:
setp.ge.u32 %p126, %r5, %r467;
@%p126 bra $L__BB0_187;
add.s32 %r409, %r467, %r39;
mul.wide.s32 %rd148, %r409, 4;
add.s64 %rd150, %rd45, %rd148;
ld.shared.f32 %f335, [%rd24];
ld.shared.f32 %f336, [%rd150];
add.f32 %f337, %f336, %f335;
st.shared.f32 [%rd24], %f337;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r467, 1;
setp.gt.u32 %p127, %r467, 3;
mov.u32 %r467, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r468, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r2, 2;
ld.shared.f32 %f338, [%rd24];
add.f32 %f392, %f338, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f339, [%rd32];
add.f32 %f392, %f392, %f339;
$L__BB0_191:
mov.b32 %r468, %f392;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd24], %f391;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f340, [%rd31];
ld.shared.f32 %f341, [%rd24];
add.f32 %f342, %f340, %f341;
st.shared.f32 [%rd24], %f342;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r5, %r469;
@%p132 bra $L__BB0_197;
add.s32 %r411, %r469, %r39;
mul.wide.s32 %rd151, %r411, 4;
add.s64 %rd153, %rd45, %rd151;
ld.shared.f32 %f343, [%rd24];
ld.shared.f32 %f344, [%rd153];
add.f32 %f345, %f344, %f343;
st.shared.f32 [%rd24], %f345;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r469, 1;
setp.gt.u32 %p133, %r469, 3;
mov.u32 %r469, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r470, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r2, 2;
ld.shared.f32 %f346, [%rd24];
add.f32 %f393, %f346, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f347, [%rd32];
add.f32 %f393, %f393, %f347;
$L__BB0_201:
mov.b32 %r470, %f393;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r413, %r177, 1;
shr.u32 %r414, %r413, 31;
add.s32 %r415, %r413, %r414;
shr.s32 %r416, %r415, 1;
add.s32 %r417, %r3, %r416;
add.s32 %r418, %r417, -1;
div.s32 %r419, %r418, %r3;
setp.ge.s32 %p137, %r74, %r419;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r7, 1;
mul.lo.s32 %r420, %r3, %r74;
shl.b32 %r123, %r420, 1;
add.s32 %r421, %r122, %r123;
or.b32 %r422, %r421, 1;
setp.ge.s32 %p138, %r422, %r177;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd156, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_10df15e9_1033910nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r425, %r123, %r122;
mul.wide.s32 %rd155, %r425, 4;
add.s64 %rd154, %rd156, %rd155;
// begin inline asm
st.global.cs.v2.s32 [%rd154], {%r468,%r470};
// end inline asm
$L__BB0_206:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<394>;
.reg .b32 %r<469>;
.reg .f64 %fd<3>;
.reg .b64 %rd<159>;
ld.param.v2.u32 {%r170, %r171}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r174, %r175}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r176, %r177}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r204, %r177, 3;
shr.s32 %r205, %r204, 31;
shr.u32 %r206, %r205, 30;
add.s32 %r207, %r204, %r206;
shr.s32 %r2, %r207, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r208, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r209, %r4, 2;
mad.lo.s32 %r210, %r209, %r208, 15;
and.b32 %r211, %r210, -16;
cvt.u64.u32 %rd1, %r211;
mul.lo.s32 %r212, %r4, %r2;
shl.b32 %r213, %r212, 4;
or.b32 %r214, %r213, 15;
and.b32 %r5, %r214, -16;
add.s32 %r215, %r214, %r5;
and.b32 %r216, %r215, -16;
cvt.s64.s32 %rd2, %r216;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r217, %r7, 3;
setp.lt.s32 %p10, %r217, %r177;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
// end inline asm
shl.b32 %r221, %r6, 4;
add.s32 %r219, %r218, %r221;
mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd37, %rd49;
mov.u32 %r220, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r220, 0;
cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r222, %r4, 63;
div.s32 %r223, %r222, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r224, %r9, %r223;
add.s32 %r225, %r224, -1;
div.s32 %r10, %r225, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r177;
cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r227, %ctaid.y;
mul.lo.s32 %r228, %r10, %r4;
mul.lo.s32 %r11, %r228, %r227;
mad.lo.s32 %r229, %r2, %r8, %r6;
shl.b32 %r12, %r229, 4;
mul.lo.s32 %r230, %r177, %r8;
cvt.s64.s32 %rd54, %r230;
cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r231, %r11, %r177;
cvt.s64.s32 %rd6, %r231;
mul.lo.s32 %r13, %r177, %r4;
mul.lo.s32 %r14, %r10, %r227;
shl.b32 %r232, %r8, 2;
mad.lo.s32 %r233, %r232, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r234, %tid.z;
mad.lo.s32 %r235, %r4, %r234, %r8;
mad.lo.s32 %r15, %r235, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r236, %r3;
mov.u32 %r237, 31;
sub.s32 %r238, %r237, %r236;
mov.u32 %r239, 1;
shl.b32 %r16, %r239, %r238;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r240, %r16, %r6;
setp.lt.u32 %p15, %r240, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r241, %r15, %r16;
mul.wide.s32 %rd59, %r241, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r242, %r16, 31;
add.s32 %r243, %r16, %r242;
shr.s32 %r17, %r243, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r244, %r15, 1;
mul.wide.u32 %rd62, %r244, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r235, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd34;
cvta.to.global.u64 %rd16, %rd35;
add.s64 %rd20, %rd46, %rd51;
mov.u32 %r427, 0;
mov.f32 %f358, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd20; cvt.u32.u64 %r247, smem_ptr; }
// end inline asm
add.s32 %r248, %r247, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r273, smem_ptr; }
// end inline asm
add.s32 %r274, %r273, %r12;
not.pred %p26, %p3;
mov.f32 %f359, %f358;
mov.f32 %f360, %f358;
mov.f32 %f361, %f358;
mov.f32 %f370, %f358;
mov.f32 %f371, %f358;
mov.f32 %f372, %f358;
mov.f32 %f373, %f358;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r245, %r427, %r4, %r8;
add.s32 %r246, %r245, %r11;
setp.gt.s32 %p17, %r246, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r250, %r13, %r427;
cvt.s64.s32 %rd67, %r250;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd36, %rd70;
mov.u32 %r249, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r249, 0;
cp.async.ca.shared.global [%r248], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r251, %r14, %r427;
mad.lo.s32 %r252, %r251, %r4, %r8;
setp.lt.s32 %p19, %r252, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r428, %r429, %r430, %r431}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r261, %r14, %r427;
mad.lo.s32 %r262, %r261, %r4, %r8;
setp.gt.s32 %p20, %r262, 63;
mov.u32 %r428, 0;
mov.u32 %r429, %r428;
mov.u32 %r430, %r428;
mov.u32 %r431, %r428;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r428, %r429, %r430, %r431}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r428, 0;
mov.u32 %r429, %r428;
mov.u32 %r430, %r428;
mov.u32 %r431, %r428;
$L__BB0_15:
add.s32 %r271, %r14, %r427;
mad.lo.s32 %r33, %r271, %r4, %r8;
mov.b32 %f112, %r431;
add.f32 %f373, %f373, %f112;
mov.b32 %f113, %r430;
add.f32 %f372, %f372, %f113;
mov.b32 %f114, %r429;
add.f32 %f371, %f371, %f114;
mov.b32 %f115, %r428;
add.f32 %f370, %f370, %f115;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f356, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r272, %r33, %r170;
mul.wide.s32 %rd71, %r272, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f356, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r276, %r13, %r427;
cvt.s64.s32 %rd75, %r276;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd33, %rd78;
mov.u32 %r275, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r275, 0;
cp.async.ca.shared.global [%r274], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r426, %r14, %r427;
mad.lo.s32 %r425, %r426, %r4, %r8;
setp.gt.s32 %p142, %r425, 63;
mov.f32 %f362, 0f00000000;
mov.f32 %f357, %f362;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r277, %r33, %r174;
mul.wide.s32 %rd79, %r277, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f357, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f363, %f362;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f119, %f120, %f121, %f122}, [%rd12];
sub.f32 %f124, %f119, %f356;
mul.f32 %f125, %f357, %f124;
ld.shared.v4.f32 {%f126, %f127, %f128, %f129}, [%rd7];
fma.rn.f32 %f358, %f125, %f126, %f358;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd10];
mul.f32 %f136, %f131, %f126;
add.f32 %f137, %f136, 0f00000000;
fma.rn.f32 %f138, %f125, %f136, 0f00000000;
sub.f32 %f140, %f120, %f356;
mul.f32 %f141, %f357, %f140;
fma.rn.f32 %f359, %f141, %f127, %f359;
mul.f32 %f144, %f132, %f127;
add.f32 %f145, %f137, %f144;
fma.rn.f32 %f146, %f141, %f144, %f138;
sub.f32 %f148, %f121, %f356;
mul.f32 %f149, %f357, %f148;
fma.rn.f32 %f360, %f149, %f128, %f360;
mul.f32 %f152, %f133, %f128;
add.f32 %f153, %f145, %f152;
fma.rn.f32 %f154, %f149, %f152, %f146;
sub.f32 %f156, %f122, %f356;
mul.f32 %f157, %f357, %f156;
fma.rn.f32 %f361, %f157, %f129, %f361;
mul.f32 %f160, %f134, %f129;
add.f32 %f363, %f153, %f160;
fma.rn.f32 %f362, %f157, %f160, %f154;
$L__BB0_23:
st.shared.f32 [%rd8], %f363;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f161, [%rd9];
ld.shared.f32 %f162, [%rd8];
add.f32 %f163, %f161, %f162;
st.shared.f32 [%rd8], %f163;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r432, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r6, %r432;
@%p28 bra $L__BB0_29;
add.s32 %r278, %r432, %r15;
mul.wide.s32 %rd81, %r278, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r432, 1;
setp.gt.u32 %p29, %r432, 3;
mov.u32 %r432, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r6, 0;
mov.f32 %f364, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f364, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
add.f32 %f364, %f364, %f169;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f362;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f170, [%rd9];
ld.shared.f32 %f171, [%rd8];
add.f32 %f172, %f170, %f171;
st.shared.f32 [%rd8], %f172;
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
mov.u32 %r433, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r6, %r433;
@%p34 bra $L__BB0_39;
add.s32 %r279, %r433, %r15;
mul.wide.s32 %rd84, %r279, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r433, 1;
setp.gt.u32 %p35, %r433, 3;
mov.u32 %r433, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f365, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f365, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
add.f32 %f365, %f365, %f178;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f364;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f365;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f179, %f357, %f1;
ld.shared.v4.f32 {%f180, %f181, %f182, %f183}, [%rd10];
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd7];
mul.f32 %f190, %f180, %f185;
mul.f32 %f191, %f190, %f2;
ld.shared.v4.f32 {%f192, %f193, %f194, %f195}, [%rd12];
sub.f32 %f197, %f192, %f356;
mul.f32 %f198, %f357, %f197;
sub.f32 %f199, %f191, %f37;
mul.f32 %f200, %f38, %f198;
sub.f32 %f201, %f199, %f200;
mul.f32 %f202, %f179, %f201;
mov.b32 %r280, %f202;
add.f32 %f203, %f202, %f192;
mov.b32 %r284, %f203;
mul.f32 %f206, %f181, %f186;
mul.f32 %f207, %f206, %f2;
sub.f32 %f209, %f193, %f356;
mul.f32 %f210, %f357, %f209;
sub.f32 %f211, %f207, %f37;
mul.f32 %f212, %f38, %f210;
sub.f32 %f213, %f211, %f212;
mul.f32 %f214, %f179, %f213;
mov.b32 %r281, %f214;
add.f32 %f215, %f214, %f193;
mov.b32 %r285, %f215;
mul.f32 %f218, %f182, %f187;
mul.f32 %f219, %f218, %f2;
sub.f32 %f221, %f194, %f356;
mul.f32 %f222, %f357, %f221;
sub.f32 %f223, %f219, %f37;
mul.f32 %f224, %f38, %f222;
sub.f32 %f225, %f223, %f224;
mul.f32 %f226, %f179, %f225;
mov.b32 %r282, %f226;
add.f32 %f227, %f226, %f194;
mov.b32 %r286, %f227;
mul.f32 %f230, %f183, %f188;
mul.f32 %f231, %f230, %f2;
sub.f32 %f233, %f195, %f356;
mul.f32 %f234, %f357, %f233;
sub.f32 %f235, %f231, %f37;
mul.f32 %f236, %f38, %f234;
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f179, %f237;
mov.b32 %r283, %f238;
add.f32 %f239, %f238, %f195;
mov.b32 %r287, %f239;
mad.lo.s32 %r288, %r33, %r177, %r7;
mul.wide.s32 %rd89, %r288, 4;
add.s64 %rd87, %rd40, %rd89;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r280,%r281,%r282,%r283};
// end inline asm
add.s64 %rd88, %rd41, %rd89;
// begin inline asm
st.global.cs.v4.s32 [%rd88], {%r284,%r285,%r286,%r287};
// end inline asm
$L__BB0_49:
add.s32 %r427, %r427, 1;
setp.lt.s32 %p41, %r427, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f358, 0f00000000;
mov.f32 %f359, %f358;
mov.f32 %f360, %f358;
mov.f32 %f361, %f358;
mov.f32 %f370, %f358;
mov.f32 %f371, %f358;
mov.f32 %f372, %f358;
mov.f32 %f373, %f358;
$L__BB0_50:
mov.u32 %r289, %tid.z;
mad.lo.s32 %r290, %r4, %r289, %r8;
mad.lo.s32 %r39, %r290, %r3, %r6;
mul.wide.u32 %rd90, %r39, 4;
add.s64 %rd24, %rd45, %rd90;
clz.b32 %r291, %r4;
mov.u32 %r292, 31;
sub.s32 %r293, %r292, %r291;
mov.u32 %r294, 1;
shl.b32 %r40, %r294, %r293;
setp.lt.u32 %p42, %r8, %r40;
add.s32 %r295, %r40, %r8;
setp.lt.u32 %p43, %r295, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r296, %r3, %r293;
add.s32 %r297, %r39, %r296;
mul.wide.s32 %rd92, %r297, 4;
add.s64 %rd25, %rd45, %rd92;
shr.u32 %r298, %r40, 31;
add.s32 %r299, %r40, %r298;
shr.s32 %r448, %r299, 1;
st.shared.f32 [%rd24], %f370;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f240, [%rd25];
ld.shared.f32 %f241, [%rd24];
add.f32 %f242, %f240, %f241;
st.shared.f32 [%rd24], %f242;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r434, %r448;
$L__BB0_54:
setp.ge.u32 %p46, %r8, %r434;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r300, %r434, %r3, %r39;
mul.wide.s32 %rd93, %r300, 4;
add.s64 %rd95, %rd45, %rd93;
ld.shared.f32 %f243, [%rd24];
ld.shared.f32 %f244, [%rd95];
add.f32 %f245, %f244, %f243;
st.shared.f32 [%rd24], %f245;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r434, 1;
setp.gt.u32 %p47, %r434, 3;
mov.u32 %r434, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r302, %r39, %r3;
mul.wide.u32 %rd96, %r302, 4;
add.s64 %rd26, %rd45, %rd96;
setp.ne.s32 %p48, %r8, 0;
mov.u32 %r435, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f246, [%rd24];
add.f32 %f374, %f246, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f247, [%rd26];
add.f32 %f374, %f374, %f247;
$L__BB0_60:
mov.b32 %r435, %f374;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd24], %f371;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f248, [%rd25];
ld.shared.f32 %f249, [%rd24];
add.f32 %f250, %f248, %f249;
st.shared.f32 [%rd24], %f250;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r436, %r448;
$L__BB0_65:
setp.ge.u32 %p52, %r8, %r436;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r303, %r436, %r3, %r39;
mul.wide.s32 %rd98, %r303, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f251, [%rd24];
ld.shared.f32 %f252, [%rd100];
add.f32 %f253, %f252, %f251;
st.shared.f32 [%rd24], %f253;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r436, 1;
setp.gt.u32 %p53, %r436, 3;
mov.u32 %r436, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r437, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f254, [%rd24];
add.f32 %f375, %f254, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f255, [%rd26];
add.f32 %f375, %f375, %f255;
$L__BB0_71:
mov.b32 %r437, %f375;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd24], %f372;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f256, [%rd25];
ld.shared.f32 %f257, [%rd24];
add.f32 %f258, %f256, %f257;
st.shared.f32 [%rd24], %f258;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r438, %r448;
$L__BB0_76:
setp.ge.u32 %p58, %r8, %r438;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r305, %r438, %r3, %r39;
mul.wide.s32 %rd101, %r305, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f259, [%rd24];
ld.shared.f32 %f260, [%rd103];
add.f32 %f261, %f260, %f259;
st.shared.f32 [%rd24], %f261;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r438, 1;
setp.gt.u32 %p59, %r438, 3;
mov.u32 %r438, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r439, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f262, [%rd24];
add.f32 %f376, %f262, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f263, [%rd26];
add.f32 %f376, %f376, %f263;
$L__BB0_82:
mov.b32 %r439, %f376;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd24], %f373;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f264, [%rd25];
ld.shared.f32 %f265, [%rd24];
add.f32 %f266, %f264, %f265;
st.shared.f32 [%rd24], %f266;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r440, %r448;
$L__BB0_87:
setp.ge.u32 %p64, %r8, %r440;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r307, %r440, %r3, %r39;
mul.wide.s32 %rd104, %r307, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f267, [%rd24];
ld.shared.f32 %f268, [%rd106];
add.f32 %f269, %f268, %f267;
st.shared.f32 [%rd24], %f269;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r440, 1;
setp.gt.u32 %p65, %r440, 3;
mov.u32 %r440, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r441, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f270, [%rd24];
add.f32 %f377, %f270, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f271, [%rd26];
add.f32 %f377, %f377, %f271;
$L__BB0_93:
mov.b32 %r441, %f377;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd24], %f358;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f272, [%rd25];
ld.shared.f32 %f273, [%rd24];
add.f32 %f274, %f272, %f273;
st.shared.f32 [%rd24], %f274;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r442, %r448;
$L__BB0_98:
setp.ge.u32 %p70, %r8, %r442;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r309, %r442, %r3, %r39;
mul.wide.s32 %rd107, %r309, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f275, [%rd24];
ld.shared.f32 %f276, [%rd109];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd24], %f277;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r442, 1;
setp.gt.u32 %p71, %r442, 3;
mov.u32 %r442, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r443, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f278, [%rd24];
add.f32 %f378, %f278, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f279, [%rd26];
add.f32 %f378, %f378, %f279;
$L__BB0_104:
mov.b32 %r443, %f378;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd24], %f359;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f280, [%rd25];
ld.shared.f32 %f281, [%rd24];
add.f32 %f282, %f280, %f281;
st.shared.f32 [%rd24], %f282;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r444, %r448;
$L__BB0_109:
setp.ge.u32 %p76, %r8, %r444;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r311, %r444, %r3, %r39;
mul.wide.s32 %rd110, %r311, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f283, [%rd24];
ld.shared.f32 %f284, [%rd112];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd24], %f285;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r444, 1;
setp.gt.u32 %p77, %r444, 3;
mov.u32 %r444, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r445, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f286, [%rd24];
add.f32 %f379, %f286, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f287, [%rd26];
add.f32 %f379, %f379, %f287;
$L__BB0_115:
mov.b32 %r445, %f379;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd24], %f360;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f288, [%rd25];
ld.shared.f32 %f289, [%rd24];
add.f32 %f290, %f288, %f289;
st.shared.f32 [%rd24], %f290;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r446, %r448;
$L__BB0_120:
setp.ge.u32 %p82, %r8, %r446;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r313, %r446, %r3, %r39;
mul.wide.s32 %rd113, %r313, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f291, [%rd24];
ld.shared.f32 %f292, [%rd115];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd24], %f293;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r446, 1;
setp.gt.u32 %p83, %r446, 3;
mov.u32 %r446, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r447, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f294, [%rd24];
add.f32 %f380, %f294, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f295, [%rd26];
add.f32 %f380, %f380, %f295;
$L__BB0_126:
mov.b32 %r447, %f380;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd24], %f361;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f296, [%rd25];
ld.shared.f32 %f297, [%rd24];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd24], %f298;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r8, %r448;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r315, %r448, %r3, %r39;
mul.wide.s32 %rd116, %r315, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f299, [%rd24];
ld.shared.f32 %f300, [%rd118];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd24], %f301;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r448, 1;
setp.gt.u32 %p89, %r448, 3;
mov.u32 %r448, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r449, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f302, [%rd24];
add.f32 %f381, %f302, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f303, [%rd26];
add.f32 %f381, %f381, %f303;
$L__BB0_136:
mov.b32 %r449, %f381;
$L__BB0_137:
setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r325, %ctaid.y;
mad.lo.s32 %r326, %r177, %r325, %r7;
mul.wide.s32 %rd121, %r326, 4;
add.s64 %rd119, %rd42, %rd121;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r435,%r437,%r439,%r441};
// end inline asm
add.s64 %rd120, %rd43, %rd121;
// begin inline asm
st.volatile.global.v4.s32 [%rd120], {%r443,%r445,%r447,%r449};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r327, %r6, %r8;
or.b32 %r329, %r327, %r289;
setp.ne.s32 %p92, %r329, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd158, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd122, %rd158;
mov.u32 %r330, %ctaid.x;
mov.u32 %r331, %ctaid.z;
mov.u32 %r332, %nctaid.x;
mad.lo.s32 %r333, %r331, %r332, %r330;
mul.wide.s32 %rd123, %r333, 8;
add.s64 %rd29, %rd122, %rd123;
add.s32 %r334, %r9, -1;
setp.eq.s32 %p93, %r74, %r334;
cvt.s64.s32 %rd124, %r9;
mov.u64 %rd125, -9223372036854775807;
sub.s64 %rd126, %rd125, %rd124;
selp.b64 %rd127, %rd126, 1, %p93;
atom.global.add.u64 %rd30, [%rd29], %rd127;
ld.volatile.global.u64 %rd128, [%rd29];
xor.b64 %rd129, %rd128, %rd30;
setp.lt.s64 %p94, %rd129, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r450, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r450;
// end inline asm
setp.lt.u32 %p95, %r450, 256;
selp.u32 %r337, 1, 0, %p95;
shl.b32 %r450, %r450, %r337;
ld.volatile.global.u64 %rd130, [%rd29];
xor.b64 %rd131, %rd130, %rd30;
setp.gt.s64 %p96, %rd131, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r338, %r9, %r3;
add.s32 %r339, %r338, -1;
div.s32 %r77, %r339, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f384, 0f00000000;
mov.f32 %f385, %f384;
@%p97 bra $L__BB0_149;
add.s32 %r341, %r177, 1;
shr.u32 %r342, %r341, 31;
add.s32 %r343, %r341, %r342;
shr.s32 %r344, %r343, 1;
add.s32 %r345, %r4, %r344;
add.s32 %r346, %r345, -1;
shl.b32 %r347, %r8, 1;
shl.b32 %r348, %r4, 1;
mad.lo.s32 %r349, %r348, %r74, %r347;
or.b32 %r350, %r349, 1;
setp.ge.s32 %p98, %r350, %r177;
div.s32 %r351, %r346, %r4;
setp.ge.s32 %p99, %r74, %r351;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r352, %r4, %r74;
shl.b32 %r353, %r352, 1;
mad.lo.s32 %r354, %r177, %r6, %r353;
add.s32 %r452, %r354, %r347;
mul.lo.s32 %r79, %r177, %r3;
mov.u32 %r340, 0;
mov.f32 %f384, 0f00000000;
mov.u32 %r451, %r6;
mov.u32 %r453, %r340;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r454, %r340;
mov.u32 %r455, %r340;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r451, %r9;
mov.u32 %r454, %r340;
mov.u32 %r455, %r340;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd133, %r452, 4;
add.s64 %rd132, %rd43, %rd133;
// begin inline asm
ld.volatile.global.v2.s32 {%r455,%r454}, [%rd132];
// end inline asm
$L__BB0_148:
mov.b32 %f308, %r455;
add.f32 %f384, %f384, %f308;
mov.b32 %f309, %r454;
add.f32 %f385, %f385, %f309;
add.s32 %r452, %r452, %r79;
add.s32 %r451, %r451, %r3;
add.s32 %r453, %r453, 1;
setp.lt.s32 %p101, %r453, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r361, %r3;
mov.u32 %r362, 31;
sub.s32 %r363, %r362, %r361;
mov.u32 %r364, 1;
shl.b32 %r90, %r364, %r363;
setp.lt.u32 %p102, %r6, %r90;
add.s32 %r365, %r90, %r6;
setp.lt.u32 %p103, %r365, %r3;
and.pred %p7, %p102, %p103;
add.s32 %r366, %r39, %r90;
mul.wide.s32 %rd134, %r366, 4;
add.s64 %rd31, %rd45, %rd134;
shr.u32 %r367, %r90, 31;
add.s32 %r368, %r90, %r367;
shr.s32 %r467, %r368, 1;
st.shared.f32 [%rd24], %f384;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f310, [%rd31];
ld.shared.f32 %f311, [%rd24];
add.f32 %f312, %f310, %f311;
st.shared.f32 [%rd24], %f312;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r456, %r467;
$L__BB0_153:
setp.ge.u32 %p106, %r6, %r456;
@%p106 bra $L__BB0_155;
add.s32 %r369, %r456, %r39;
mul.wide.s32 %rd136, %r369, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f313, [%rd24];
ld.shared.f32 %f314, [%rd138];
add.f32 %f315, %f314, %f313;
st.shared.f32 [%rd24], %f315;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r456, 1;
setp.gt.u32 %p107, %r456, 3;
mov.u32 %r456, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r371, %r39, 1;
mul.wide.u32 %rd139, %r371, 4;
add.s64 %rd32, %rd45, %rd139;
setp.ne.s32 %p108, %r6, 0;
mov.u32 %r457, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f316, [%rd24];
add.f32 %f386, %f316, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f317, [%rd32];
add.f32 %f386, %f386, %f317;
$L__BB0_159:
mov.b32 %r457, %f386;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd24], %f385;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f318, [%rd31];
ld.shared.f32 %f319, [%rd24];
add.f32 %f320, %f318, %f319;
st.shared.f32 [%rd24], %f320;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r458, %r467;
$L__BB0_164:
setp.ge.u32 %p112, %r6, %r458;
@%p112 bra $L__BB0_166;
add.s32 %r372, %r458, %r39;
mul.wide.s32 %rd141, %r372, 4;
add.s64 %rd143, %rd45, %rd141;
ld.shared.f32 %f321, [%rd24];
ld.shared.f32 %f322, [%rd143];
add.f32 %f323, %f322, %f321;
st.shared.f32 [%rd24], %f323;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r458, 1;
setp.gt.u32 %p113, %r458, 3;
mov.u32 %r458, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r459, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f324, [%rd24];
add.f32 %f387, %f324, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f325, [%rd32];
add.f32 %f387, %f387, %f325;
$L__BB0_170:
mov.b32 %r459, %f387;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
add.s32 %r374, %r177, 1;
shr.u32 %r375, %r374, 31;
add.s32 %r376, %r374, %r375;
shr.s32 %r377, %r376, 1;
add.s32 %r378, %r4, %r377;
add.s32 %r379, %r378, -1;
div.s32 %r380, %r379, %r4;
setp.ge.s32 %p117, %r74, %r380;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r8, 1;
mul.lo.s32 %r381, %r4, %r74;
shl.b32 %r101, %r381, 1;
add.s32 %r382, %r100, %r101;
or.b32 %r383, %r382, 1;
setp.ge.s32 %p118, %r383, %r177;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd157, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r386, %r101, %r100;
mul.wide.s32 %rd145, %r386, 4;
add.s64 %rd144, %rd157, %rd145;
// begin inline asm
st.global.cs.v2.s32 [%rd144], {%r457,%r459};
// end inline asm
$L__BB0_175:
mov.f32 %f390, 0f00000000;
mov.f32 %f391, %f390;
@%p97 bra $L__BB0_181;
add.s32 %r388, %r177, 1;
shr.u32 %r389, %r388, 31;
add.s32 %r390, %r388, %r389;
shr.s32 %r391, %r390, 1;
add.s32 %r392, %r4, %r391;
add.s32 %r393, %r392, -1;
shl.b32 %r394, %r8, 1;
shl.b32 %r395, %r4, 1;
mad.lo.s32 %r396, %r395, %r74, %r394;
or.b32 %r397, %r396, 1;
setp.ge.s32 %p120, %r397, %r177;
div.s32 %r398, %r393, %r4;
setp.ge.s32 %p121, %r74, %r398;
or.pred %p8, %p121, %p120;
mul.lo.s32 %r399, %r4, %r74;
shl.b32 %r400, %r399, 1;
mad.lo.s32 %r401, %r177, %r6, %r400;
add.s32 %r461, %r401, %r394;
mul.lo.s32 %r103, %r177, %r3;
mov.u32 %r387, 0;
mov.f32 %f390, 0f00000000;
mov.u32 %r460, %r6;
mov.u32 %r462, %r387;
$L__BB0_177:
.pragma "nounroll";
mov.u32 %r463, %r387;
mov.u32 %r464, %r387;
@%p8 bra $L__BB0_180;
setp.ge.s32 %p122, %r460, %r9;
mov.u32 %r463, %r387;
mov.u32 %r464, %r387;
@%p122 bra $L__BB0_180;
mul.wide.s32 %rd147, %r461, 4;
add.s64 %rd146, %rd42, %rd147;
// begin inline asm
ld.volatile.global.v2.s32 {%r464,%r463}, [%rd146];
// end inline asm
$L__BB0_180:
mov.b32 %f330, %r464;
add.f32 %f390, %f390, %f330;
mov.b32 %f331, %r463;
add.f32 %f391, %f391, %f331;
add.s32 %r461, %r461, %r103;
add.s32 %r460, %r460, %r3;
add.s32 %r462, %r462, 1;
setp.lt.s32 %p123, %r462, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd24], %f390;
bar.sync 0;
@%p104 bra $L__BB0_183;
ld.shared.f32 %f332, [%rd31];
ld.shared.f32 %f333, [%rd24];
add.f32 %f334, %f332, %f333;
st.shared.f32 [%rd24], %f334;
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
mov.u32 %r465, %r467;
$L__BB0_185:
setp.ge.u32 %p126, %r6, %r465;
@%p126 bra $L__BB0_187;
add.s32 %r408, %r465, %r39;
mul.wide.s32 %rd148, %r408, 4;
add.s64 %rd150, %rd45, %rd148;
ld.shared.f32 %f335, [%rd24];
ld.shared.f32 %f336, [%rd150];
add.f32 %f337, %f336, %f335;
st.shared.f32 [%rd24], %f337;
$L__BB0_187:
bar.sync 0;
shr.u32 %r115, %r465, 1;
setp.gt.u32 %p127, %r465, 3;
mov.u32 %r465, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
mov.u32 %r466, 0;
@%p108 bra $L__BB0_192;
setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f338, [%rd24];
add.f32 %f392, %f338, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f339, [%rd32];
add.f32 %f392, %f392, %f339;
$L__BB0_191:
mov.b32 %r466, %f392;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd24], %f391;
bar.sync 0;
@%p104 bra $L__BB0_194;
ld.shared.f32 %f340, [%rd31];
ld.shared.f32 %f341, [%rd24];
add.f32 %f342, %f340, %f341;
st.shared.f32 [%rd24], %f342;
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
setp.ge.u32 %p132, %r6, %r467;
@%p132 bra $L__BB0_197;
add.s32 %r410, %r467, %r39;
mul.wide.s32 %rd151, %r410, 4;
add.s64 %rd153, %rd45, %rd151;
ld.shared.f32 %f343, [%rd24];
ld.shared.f32 %f344, [%rd153];
add.f32 %f345, %f344, %f343;
st.shared.f32 [%rd24], %f345;
$L__BB0_197:
bar.sync 0;
shr.u32 %r119, %r467, 1;
setp.gt.u32 %p133, %r467, 3;
mov.u32 %r467, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
mov.u32 %r468, 0;
@%p108 bra $L__BB0_202;
setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f346, [%rd24];
add.f32 %f393, %f346, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f347, [%rd32];
add.f32 %f393, %f393, %f347;
$L__BB0_201:
mov.b32 %r468, %f393;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
add.s32 %r412, %r177, 1;
shr.u32 %r413, %r412, 31;
add.s32 %r414, %r412, %r413;
shr.s32 %r415, %r414, 1;
add.s32 %r416, %r4, %r415;
add.s32 %r417, %r416, -1;
div.s32 %r418, %r417, %r4;
setp.ge.s32 %p137, %r74, %r418;
@%p137 bra $L__BB0_206;
shl.b32 %r122, %r8, 1;
mul.lo.s32 %r419, %r4, %r74;
shl.b32 %r123, %r419, 1;
add.s32 %r420, %r122, %r123;
or.b32 %r421, %r420, 1;
setp.ge.s32 %p138, %r421, %r177;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd156, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_58_cu_f6f9a5f6_723310nvfuser_58ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r424, %r123, %r122;
mul.wide.s32 %rd155, %r424, 4;
add.s64 %rd154, %rd156, %rd155;
// begin inline asm
st.global.cs.v2.s32 [%rd154], {%r466,%r468};
// end inline asm
$L__BB0_206:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -30,11 +30,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11[16]
)
{
.reg .pred %p<143>;
.reg .f32 %f<394>;
- .reg .b32 %r<471>;
+ .reg .b32 %r<469>;
.reg .f64 %fd<3>;
.reg .b64 %rd<159>;
ld.param.v2.u32 {%r170, %r171}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
@@ -51,119 +51,119 @@
ld.param.u64 %rd33, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r204, %r177, 3;
shr.s32 %r205, %r204, 31;
shr.u32 %r206, %r205, 30;
add.s32 %r207, %r204, %r206;
- shr.s32 %r208, %r207, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r209, %r208, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r210, %r3, 2;
- mad.lo.s32 %r211, %r210, %r209, 15;
- and.b32 %r212, %r211, -16;
- cvt.u64.u32 %rd1, %r212;
- mul.lo.s32 %r213, %r3, %r208;
- shl.b32 %r214, %r213, 4;
- or.b32 %r215, %r214, 15;
- and.b32 %r4, %r215, -16;
- add.s32 %r216, %r215, %r4;
- and.b32 %r217, %r216, -16;
- cvt.s64.s32 %rd2, %r217;
+ shr.s32 %r2, %r207, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r208, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r209, %r4, 2;
+ mad.lo.s32 %r210, %r209, %r208, 15;
+ and.b32 %r211, %r210, -16;
+ cvt.u64.u32 %rd1, %r211;
+ mul.lo.s32 %r212, %r4, %r2;
+ shl.b32 %r213, %r212, 4;
+ or.b32 %r214, %r213, 15;
+ and.b32 %r5, %r214, -16;
+ add.s32 %r215, %r214, %r5;
+ and.b32 %r216, %r215, -16;
+ cvt.s64.s32 %rd2, %r216;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p9, %r5, %r208;
- shl.b32 %r6, %r5, 2;
- or.b32 %r218, %r6, 3;
- setp.lt.s32 %p10, %r218, %r177;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p9, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r217, %r7, 3;
+ setp.lt.s32 %p10, %r217, %r177;
and.pred %p1, %p10, %p9;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p11, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r219, smem_ptr; }
-
-
- shl.b32 %r222, %r5, 4;
- add.s32 %r220, %r219, %r222;
- mul.wide.s32 %rd49, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
+
+
+ shl.b32 %r221, %r6, 4;
+ add.s32 %r219, %r218, %r221;
+ mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd37, %rd49;
- mov.u32 %r221, 0;
+ mov.u32 %r220, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r221, 0;
- cp.async.ca.shared.global [%r220], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r220, 0;
+ cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r223, %r3, 63;
- div.s32 %r224, %r223, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r225, %r8, %r224;
- add.s32 %r226, %r225, -1;
- div.s32 %r9, %r226, %r8;
- setp.gt.s32 %p13, %r9, 0;
+ add.s32 %r222, %r4, 63;
+ div.s32 %r223, %r222, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r224, %r9, %r223;
+ add.s32 %r225, %r224, -1;
+ div.s32 %r10, %r225, %r9;
+ setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r177;
- cvt.s64.s32 %rd50, %r4;
+ cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
- mov.u32 %r228, %ctaid.y;
- mul.lo.s32 %r229, %r9, %r3;
- mul.lo.s32 %r10, %r229, %r228;
- shl.b32 %r230, %r7, 2;
- shl.b32 %r231, %r5, 4;
- mad.lo.s32 %r11, %r230, %r177, %r231;
- mul.lo.s32 %r232, %r177, %r7;
- cvt.s64.s32 %rd54, %r232;
- cvt.s64.s32 %rd55, %r6;
+ mov.u32 %r227, %ctaid.y;
+ mul.lo.s32 %r228, %r10, %r4;
+ mul.lo.s32 %r11, %r228, %r227;
+ mad.lo.s32 %r229, %r2, %r8, %r6;
+ shl.b32 %r12, %r229, 4;
+ mul.lo.s32 %r230, %r177, %r8;
+ cvt.s64.s32 %rd54, %r230;
+ cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r233, %r10, %r177;
- cvt.s64.s32 %rd6, %r233;
- mul.lo.s32 %r12, %r177, %r3;
- mul.lo.s32 %r13, %r9, %r228;
- add.s32 %r14, %r232, %r6;
+ mul.lo.s32 %r231, %r11, %r177;
+ cvt.s64.s32 %rd6, %r231;
+ mul.lo.s32 %r13, %r177, %r4;
+ mul.lo.s32 %r14, %r10, %r227;
+ shl.b32 %r232, %r8, 2;
+ mad.lo.s32 %r233, %r232, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
- mul.wide.s32 %rd57, %r14, 4;
+ mul.wide.s32 %rd57, %r233, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r234, %tid.z;
- mad.lo.s32 %r235, %r3, %r234, %r7;
- mad.lo.s32 %r15, %r235, %r2, %r5;
+ mad.lo.s32 %r235, %r4, %r234, %r8;
+ mad.lo.s32 %r15, %r235, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
- clz.b32 %r236, %r2;
+ clz.b32 %r236, %r3;
mov.u32 %r237, 31;
sub.s32 %r238, %r237, %r236;
mov.u32 %r239, 1;
shl.b32 %r16, %r239, %r238;
- setp.lt.u32 %p14, %r5, %r16;
- add.s32 %r240, %r16, %r5;
- setp.lt.u32 %p15, %r240, %r2;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r240, %r16, %r6;
+ setp.lt.u32 %p15, %r240, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r241, %r15, %r16;
mul.wide.s32 %rd59, %r241, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r242, %r16, 31;
add.s32 %r243, %r16, %r242;
shr.s32 %r17, %r243, 1;
add.s64 %rd60, %rd45, %rd4;
- mul.wide.s32 %rd61, %r6, 4;
+ mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r244, %r15, 1;
mul.wide.u32 %rd62, %r244, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
@@ -171,23 +171,23 @@
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd34;
cvta.to.global.u64 %rd16, %rd35;
add.s64 %rd20, %rd46, %rd51;
- mov.u32 %r429, 0;
+ mov.u32 %r427, 0;
mov.f32 %f358, 0f00000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd20; cvt.u32.u64 %r247, smem_ptr; }
- add.s32 %r248, %r11, %r247;
+ add.s32 %r248, %r247, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r273, smem_ptr; }
- add.s32 %r274, %r11, %r273;
+ add.s32 %r274, %r273, %r12;
not.pred %p26, %p3;
mov.f32 %f359, %f358;
mov.f32 %f360, %f358;
mov.f32 %f361, %f358;
mov.f32 %f370, %f358;
@@ -197,16 +197,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r245, %r429, %r3, %r7;
- add.s32 %r246, %r245, %r10;
+ mad.lo.s32 %r245, %r427, %r4, %r8;
+ add.s32 %r246, %r245, %r11;
setp.gt.s32 %p17, %r246, 63;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r250, %r12, %r429;
+ mul.lo.s32 %r250, %r13, %r427;
cvt.s64.s32 %rd67, %r250;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd36, %rd70;
@@ -225,53 +225,53 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r251, %r13, %r429;
- mad.lo.s32 %r252, %r251, %r3, %r7;
+ add.s32 %r251, %r14, %r427;
+ mad.lo.s32 %r252, %r251, %r4, %r8;
setp.lt.s32 %p19, %r252, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r430, %r431, %r432, %r433}, [%rd7];
+ ld.shared.v4.u32 {%r428, %r429, %r430, %r431}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r261, %r13, %r429;
- mad.lo.s32 %r262, %r261, %r3, %r7;
+ add.s32 %r261, %r14, %r427;
+ mad.lo.s32 %r262, %r261, %r4, %r8;
setp.gt.s32 %p20, %r262, 63;
- mov.u32 %r430, 0;
- mov.u32 %r431, %r430;
- mov.u32 %r432, %r430;
- mov.u32 %r433, %r430;
+ mov.u32 %r428, 0;
+ mov.u32 %r429, %r428;
+ mov.u32 %r430, %r428;
+ mov.u32 %r431, %r428;
@%p20 bra $L__BB0_15;
- ld.shared.v4.u32 {%r430, %r431, %r432, %r433}, [%rd7];
+ ld.shared.v4.u32 {%r428, %r429, %r430, %r431}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r430, 0;
- mov.u32 %r431, %r430;
- mov.u32 %r432, %r430;
- mov.u32 %r433, %r430;
+ mov.u32 %r428, 0;
+ mov.u32 %r429, %r428;
+ mov.u32 %r430, %r428;
+ mov.u32 %r431, %r428;
$L__BB0_15:
- add.s32 %r271, %r13, %r429;
- mad.lo.s32 %r33, %r271, %r3, %r7;
- mov.b32 %f112, %r433;
+ add.s32 %r271, %r14, %r427;
+ mad.lo.s32 %r33, %r271, %r4, %r8;
+ mov.b32 %f112, %r431;
add.f32 %f373, %f373, %f112;
- mov.b32 %f113, %r432;
+ mov.b32 %f113, %r430;
add.f32 %f372, %f372, %f113;
- mov.b32 %f114, %r431;
+ mov.b32 %f114, %r429;
add.f32 %f371, %f371, %f114;
- mov.b32 %f115, %r430;
+ mov.b32 %f115, %r428;
add.f32 %f370, %f370, %f115;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f356, 0f00000000;
@%p21 bra $L__BB0_17;
@@ -284,11 +284,11 @@
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
- mul.lo.s32 %r276, %r12, %r429;
+ mul.lo.s32 %r276, %r13, %r427;
cvt.s64.s32 %rd75, %r276;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd33, %rd78;
@@ -301,13 +301,13 @@
}
$L__BB0_19:
- add.s32 %r428, %r13, %r429;
- mad.lo.s32 %r427, %r428, %r3, %r7;
- setp.gt.s32 %p142, %r427, 63;
+ add.s32 %r426, %r14, %r427;
+ mad.lo.s32 %r425, %r426, %r4, %r8;
+ setp.gt.s32 %p142, %r425, 63;
mov.f32 %f362, 0f00000000;
mov.f32 %f357, %f362;
@%p142 bra $L__BB0_21;
mul.lo.s32 %r277, %r33, %r174;
@@ -364,37 +364,37 @@
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
- mov.u32 %r434, %r17;
+ mov.u32 %r432, %r17;
$L__BB0_27:
- setp.ge.u32 %p28, %r5, %r434;
+ setp.ge.u32 %p28, %r6, %r432;
@%p28 bra $L__BB0_29;
- add.s32 %r278, %r434, %r15;
+ add.s32 %r278, %r432, %r15;
mul.wide.s32 %rd81, %r278, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f164, [%rd8];
ld.shared.f32 %f165, [%rd83];
add.f32 %f166, %f165, %f164;
st.shared.f32 [%rd8], %f166;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r434, 1;
- setp.gt.u32 %p29, %r434, 3;
- mov.u32 %r434, %r35;
+ shr.u32 %r35, %r432, 1;
+ setp.gt.u32 %p29, %r432, 3;
+ mov.u32 %r432, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p30, %r5, 0;
+ setp.ne.s32 %p30, %r6, 0;
mov.f32 %f364, 0f00000000;
@%p30 bra $L__BB0_33;
- setp.lt.u32 %p31, %r2, 2;
+ setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f168, [%rd8];
add.f32 %f364, %f168, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f169, [%rd11];
@@ -414,36 +414,36 @@
$L__BB0_35:
setp.lt.s32 %p139, %r16, 4;
bar.sync 0;
@%p139 bra $L__BB0_40;
- mov.u32 %r435, %r17;
+ mov.u32 %r433, %r17;
$L__BB0_37:
- setp.ge.u32 %p34, %r5, %r435;
+ setp.ge.u32 %p34, %r6, %r433;
@%p34 bra $L__BB0_39;
- add.s32 %r279, %r435, %r15;
+ add.s32 %r279, %r433, %r15;
mul.wide.s32 %rd84, %r279, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f173, [%rd8];
ld.shared.f32 %f174, [%rd86];
add.f32 %f175, %f174, %f173;
st.shared.f32 [%rd8], %f175;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r435, 1;
- setp.gt.u32 %p35, %r435, 3;
- mov.u32 %r435, %r37;
+ shr.u32 %r37, %r433, 1;
+ setp.gt.u32 %p35, %r433, 3;
+ mov.u32 %r433, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f365, 0f00000000;
@%p30 bra $L__BB0_43;
- setp.lt.u32 %p37, %r2, 2;
+ setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f177, [%rd8];
add.f32 %f365, %f177, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f178, [%rd11];
@@ -515,25 +515,24 @@
sub.f32 %f237, %f235, %f236;
mul.f32 %f238, %f179, %f237;
mov.b32 %r283, %f238;
add.f32 %f239, %f238, %f195;
mov.b32 %r287, %f239;
- mad.lo.s32 %r288, %r429, %r3, %r10;
- mad.lo.s32 %r289, %r288, %r177, %r14;
- mul.wide.s32 %rd89, %r289, 4;
+ mad.lo.s32 %r288, %r33, %r177, %r7;
+ mul.wide.s32 %rd89, %r288, 4;
add.s64 %rd87, %rd40, %rd89;
st.global.cs.v4.s32 [%rd87], {%r280,%r281,%r282,%r283};
add.s64 %rd88, %rd41, %rd89;
st.global.cs.v4.s32 [%rd88], {%r284,%r285,%r286,%r287};
$L__BB0_49:
- add.s32 %r429, %r429, 1;
- setp.lt.s32 %p41, %r429, %r9;
+ add.s32 %r427, %r427, 1;
+ setp.lt.s32 %p41, %r427, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f358, 0f00000000;
@@ -544,31 +543,31 @@
mov.f32 %f371, %f358;
mov.f32 %f372, %f358;
mov.f32 %f373, %f358;
$L__BB0_50:
- mov.u32 %r290, %tid.z;
- mad.lo.s32 %r291, %r3, %r290, %r7;
- mad.lo.s32 %r39, %r291, %r2, %r5;
+ mov.u32 %r289, %tid.z;
+ mad.lo.s32 %r290, %r4, %r289, %r8;
+ mad.lo.s32 %r39, %r290, %r3, %r6;
mul.wide.u32 %rd90, %r39, 4;
add.s64 %rd24, %rd45, %rd90;
- clz.b32 %r292, %r3;
- mov.u32 %r293, 31;
- sub.s32 %r294, %r293, %r292;
- mov.u32 %r295, 1;
- shl.b32 %r40, %r295, %r294;
- setp.lt.u32 %p42, %r7, %r40;
- add.s32 %r296, %r40, %r7;
- setp.lt.u32 %p43, %r296, %r3;
+ clz.b32 %r291, %r4;
+ mov.u32 %r292, 31;
+ sub.s32 %r293, %r292, %r291;
+ mov.u32 %r294, 1;
+ shl.b32 %r40, %r294, %r293;
+ setp.lt.u32 %p42, %r8, %r40;
+ add.s32 %r295, %r40, %r8;
+ setp.lt.u32 %p43, %r295, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r297, %r2, %r294;
- add.s32 %r298, %r39, %r297;
- mul.wide.s32 %rd92, %r298, 4;
+ shl.b32 %r296, %r3, %r293;
+ add.s32 %r297, %r39, %r296;
+ mul.wide.s32 %rd92, %r297, 4;
add.s64 %rd25, %rd45, %rd92;
- shr.u32 %r299, %r40, 31;
- add.s32 %r300, %r40, %r299;
- shr.s32 %r450, %r300, 1;
+ shr.u32 %r298, %r40, 31;
+ add.s32 %r299, %r40, %r298;
+ shr.s32 %r448, %r299, 1;
st.shared.f32 [%rd24], %f370;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
@@ -580,49 +579,49 @@
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
- mov.u32 %r436, %r450;
+ mov.u32 %r434, %r448;
$L__BB0_54:
- setp.ge.u32 %p46, %r7, %r436;
+ setp.ge.u32 %p46, %r8, %r434;
@%p46 bra $L__BB0_56;
- mad.lo.s32 %r301, %r436, %r2, %r39;
- mul.wide.s32 %rd93, %r301, 4;
+ mad.lo.s32 %r300, %r434, %r3, %r39;
+ mul.wide.s32 %rd93, %r300, 4;
add.s64 %rd95, %rd45, %rd93;
ld.shared.f32 %f243, [%rd24];
ld.shared.f32 %f244, [%rd95];
add.f32 %f245, %f244, %f243;
st.shared.f32 [%rd24], %f245;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r436, 1;
- setp.gt.u32 %p47, %r436, 3;
- mov.u32 %r436, %r43;
+ shr.u32 %r43, %r434, 1;
+ setp.gt.u32 %p47, %r434, 3;
+ mov.u32 %r434, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r303, %r39, %r2;
- mul.wide.u32 %rd96, %r303, 4;
+ add.s32 %r302, %r39, %r3;
+ mul.wide.u32 %rd96, %r302, 4;
add.s64 %rd26, %rd45, %rd96;
- setp.ne.s32 %p48, %r7, 0;
- mov.u32 %r437, 0;
+ setp.ne.s32 %p48, %r8, 0;
+ mov.u32 %r435, 0;
@%p48 bra $L__BB0_61;
- setp.lt.u32 %p49, %r3, 2;
+ setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f246, [%rd24];
add.f32 %f374, %f246, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f247, [%rd26];
add.f32 %f374, %f374, %f247;
$L__BB0_60:
- mov.b32 %r437, %f374;
+ mov.b32 %r435, %f374;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd24], %f371;
bar.sync 0;
@@ -635,45 +634,45 @@
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
- mov.u32 %r438, %r450;
+ mov.u32 %r436, %r448;
$L__BB0_65:
- setp.ge.u32 %p52, %r7, %r438;
+ setp.ge.u32 %p52, %r8, %r436;
@%p52 bra $L__BB0_67;
- mad.lo.s32 %r304, %r438, %r2, %r39;
- mul.wide.s32 %rd98, %r304, 4;
+ mad.lo.s32 %r303, %r436, %r3, %r39;
+ mul.wide.s32 %rd98, %r303, 4;
add.s64 %rd100, %rd45, %rd98;
ld.shared.f32 %f251, [%rd24];
ld.shared.f32 %f252, [%rd100];
add.f32 %f253, %f252, %f251;
st.shared.f32 [%rd24], %f253;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r438, 1;
- setp.gt.u32 %p53, %r438, 3;
- mov.u32 %r438, %r47;
+ shr.u32 %r47, %r436, 1;
+ setp.gt.u32 %p53, %r436, 3;
+ mov.u32 %r436, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r439, 0;
+ mov.u32 %r437, 0;
@%p48 bra $L__BB0_72;
- setp.lt.u32 %p55, %r3, 2;
+ setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f254, [%rd24];
add.f32 %f375, %f254, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f255, [%rd26];
add.f32 %f375, %f375, %f255;
$L__BB0_71:
- mov.b32 %r439, %f375;
+ mov.b32 %r437, %f375;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd24], %f372;
bar.sync 0;
@@ -686,45 +685,45 @@
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
- mov.u32 %r440, %r450;
+ mov.u32 %r438, %r448;
$L__BB0_76:
- setp.ge.u32 %p58, %r7, %r440;
+ setp.ge.u32 %p58, %r8, %r438;
@%p58 bra $L__BB0_78;
- mad.lo.s32 %r306, %r440, %r2, %r39;
- mul.wide.s32 %rd101, %r306, 4;
+ mad.lo.s32 %r305, %r438, %r3, %r39;
+ mul.wide.s32 %rd101, %r305, 4;
add.s64 %rd103, %rd45, %rd101;
ld.shared.f32 %f259, [%rd24];
ld.shared.f32 %f260, [%rd103];
add.f32 %f261, %f260, %f259;
st.shared.f32 [%rd24], %f261;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r440, 1;
- setp.gt.u32 %p59, %r440, 3;
- mov.u32 %r440, %r51;
+ shr.u32 %r51, %r438, 1;
+ setp.gt.u32 %p59, %r438, 3;
+ mov.u32 %r438, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r441, 0;
+ mov.u32 %r439, 0;
@%p48 bra $L__BB0_83;
- setp.lt.u32 %p61, %r3, 2;
+ setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f262, [%rd24];
add.f32 %f376, %f262, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f263, [%rd26];
add.f32 %f376, %f376, %f263;
$L__BB0_82:
- mov.b32 %r441, %f376;
+ mov.b32 %r439, %f376;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd24], %f373;
bar.sync 0;
@@ -737,45 +736,45 @@
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
- mov.u32 %r442, %r450;
+ mov.u32 %r440, %r448;
$L__BB0_87:
- setp.ge.u32 %p64, %r7, %r442;
+ setp.ge.u32 %p64, %r8, %r440;
@%p64 bra $L__BB0_89;
- mad.lo.s32 %r308, %r442, %r2, %r39;
- mul.wide.s32 %rd104, %r308, 4;
+ mad.lo.s32 %r307, %r440, %r3, %r39;
+ mul.wide.s32 %rd104, %r307, 4;
add.s64 %rd106, %rd45, %rd104;
ld.shared.f32 %f267, [%rd24];
ld.shared.f32 %f268, [%rd106];
add.f32 %f269, %f268, %f267;
st.shared.f32 [%rd24], %f269;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r442, 1;
- setp.gt.u32 %p65, %r442, 3;
- mov.u32 %r442, %r55;
+ shr.u32 %r55, %r440, 1;
+ setp.gt.u32 %p65, %r440, 3;
+ mov.u32 %r440, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r443, 0;
+ mov.u32 %r441, 0;
@%p48 bra $L__BB0_94;
- setp.lt.u32 %p67, %r3, 2;
+ setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f270, [%rd24];
add.f32 %f377, %f270, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f271, [%rd26];
add.f32 %f377, %f377, %f271;
$L__BB0_93:
- mov.b32 %r443, %f377;
+ mov.b32 %r441, %f377;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd24], %f358;
bar.sync 0;
@@ -788,45 +787,45 @@
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
- mov.u32 %r444, %r450;
+ mov.u32 %r442, %r448;
$L__BB0_98:
- setp.ge.u32 %p70, %r7, %r444;
+ setp.ge.u32 %p70, %r8, %r442;
@%p70 bra $L__BB0_100;
- mad.lo.s32 %r310, %r444, %r2, %r39;
- mul.wide.s32 %rd107, %r310, 4;
+ mad.lo.s32 %r309, %r442, %r3, %r39;
+ mul.wide.s32 %rd107, %r309, 4;
add.s64 %rd109, %rd45, %rd107;
ld.shared.f32 %f275, [%rd24];
ld.shared.f32 %f276, [%rd109];
add.f32 %f277, %f276, %f275;
st.shared.f32 [%rd24], %f277;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r444, 1;
- setp.gt.u32 %p71, %r444, 3;
- mov.u32 %r444, %r59;
+ shr.u32 %r59, %r442, 1;
+ setp.gt.u32 %p71, %r442, 3;
+ mov.u32 %r442, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r445, 0;
+ mov.u32 %r443, 0;
@%p48 bra $L__BB0_105;
- setp.lt.u32 %p73, %r3, 2;
+ setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f278, [%rd24];
add.f32 %f378, %f278, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f279, [%rd26];
add.f32 %f378, %f378, %f279;
$L__BB0_104:
- mov.b32 %r445, %f378;
+ mov.b32 %r443, %f378;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd24], %f359;
bar.sync 0;
@@ -839,45 +838,45 @@
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
- mov.u32 %r446, %r450;
+ mov.u32 %r444, %r448;
$L__BB0_109:
- setp.ge.u32 %p76, %r7, %r446;
+ setp.ge.u32 %p76, %r8, %r444;
@%p76 bra $L__BB0_111;
- mad.lo.s32 %r312, %r446, %r2, %r39;
- mul.wide.s32 %rd110, %r312, 4;
+ mad.lo.s32 %r311, %r444, %r3, %r39;
+ mul.wide.s32 %rd110, %r311, 4;
add.s64 %rd112, %rd45, %rd110;
ld.shared.f32 %f283, [%rd24];
ld.shared.f32 %f284, [%rd112];
add.f32 %f285, %f284, %f283;
st.shared.f32 [%rd24], %f285;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r446, 1;
- setp.gt.u32 %p77, %r446, 3;
- mov.u32 %r446, %r63;
+ shr.u32 %r63, %r444, 1;
+ setp.gt.u32 %p77, %r444, 3;
+ mov.u32 %r444, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r447, 0;
+ mov.u32 %r445, 0;
@%p48 bra $L__BB0_116;
- setp.lt.u32 %p79, %r3, 2;
+ setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f286, [%rd24];
add.f32 %f379, %f286, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f287, [%rd26];
add.f32 %f379, %f379, %f287;
$L__BB0_115:
- mov.b32 %r447, %f379;
+ mov.b32 %r445, %f379;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd24], %f360;
bar.sync 0;
@@ -890,45 +889,45 @@
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
- mov.u32 %r448, %r450;
+ mov.u32 %r446, %r448;
$L__BB0_120:
- setp.ge.u32 %p82, %r7, %r448;
+ setp.ge.u32 %p82, %r8, %r446;
@%p82 bra $L__BB0_122;
- mad.lo.s32 %r314, %r448, %r2, %r39;
- mul.wide.s32 %rd113, %r314, 4;
+ mad.lo.s32 %r313, %r446, %r3, %r39;
+ mul.wide.s32 %rd113, %r313, 4;
add.s64 %rd115, %rd45, %rd113;
ld.shared.f32 %f291, [%rd24];
ld.shared.f32 %f292, [%rd115];
add.f32 %f293, %f292, %f291;
st.shared.f32 [%rd24], %f293;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r448, 1;
- setp.gt.u32 %p83, %r448, 3;
- mov.u32 %r448, %r67;
+ shr.u32 %r67, %r446, 1;
+ setp.gt.u32 %p83, %r446, 3;
+ mov.u32 %r446, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r449, 0;
+ mov.u32 %r447, 0;
@%p48 bra $L__BB0_127;
- setp.lt.u32 %p85, %r3, 2;
+ setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f294, [%rd24];
add.f32 %f380, %f294, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f295, [%rd26];
add.f32 %f380, %f380, %f295;
$L__BB0_126:
- mov.b32 %r449, %f380;
+ mov.b32 %r447, %f380;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd24], %f361;
bar.sync 0;
@@ -942,185 +941,184 @@
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p88, %r7, %r450;
+ setp.ge.u32 %p88, %r8, %r448;
@%p88 bra $L__BB0_132;
- mad.lo.s32 %r316, %r450, %r2, %r39;
- mul.wide.s32 %rd116, %r316, 4;
+ mad.lo.s32 %r315, %r448, %r3, %r39;
+ mul.wide.s32 %rd116, %r315, 4;
add.s64 %rd118, %rd45, %rd116;
ld.shared.f32 %f299, [%rd24];
ld.shared.f32 %f300, [%rd118];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd24], %f301;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r450, 1;
- setp.gt.u32 %p89, %r450, 3;
- mov.u32 %r450, %r71;
+ shr.u32 %r71, %r448, 1;
+ setp.gt.u32 %p89, %r448, 3;
+ mov.u32 %r448, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r451, 0;
+ mov.u32 %r449, 0;
@%p48 bra $L__BB0_137;
- setp.lt.u32 %p91, %r3, 2;
+ setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f302, [%rd24];
add.f32 %f381, %f302, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f303, [%rd26];
add.f32 %f381, %f381, %f303;
$L__BB0_136:
- mov.b32 %r451, %f381;
+ mov.b32 %r449, %f381;
$L__BB0_137:
- setp.eq.s32 %p141, %r7, 0;
+ setp.eq.s32 %p141, %r8, 0;
and.pred %p140, %p141, %p1;
bar.sync 0;
@%p140 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r426, %r5, 2;
- mov.u32 %r326, %ctaid.y;
- mad.lo.s32 %r327, %r177, %r326, %r426;
- mul.wide.s32 %rd121, %r327, 4;
+ mov.u32 %r325, %ctaid.y;
+ mad.lo.s32 %r326, %r177, %r325, %r7;
+ mul.wide.s32 %rd121, %r326, 4;
add.s64 %rd119, %rd42, %rd121;
- st.volatile.global.v4.s32 [%rd119], {%r437,%r439,%r441,%r443};
+ st.volatile.global.v4.s32 [%rd119], {%r435,%r437,%r439,%r441};
add.s64 %rd120, %rd43, %rd121;
- st.volatile.global.v4.s32 [%rd120], {%r445,%r447,%r449,%r451};
+ st.volatile.global.v4.s32 [%rd120], {%r443,%r445,%r447,%r449};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r328, %r5, %r7;
- or.b32 %r330, %r328, %r290;
- setp.ne.s32 %p92, %r330, 0;
+ or.b32 %r327, %r6, %r8;
+ or.b32 %r329, %r327, %r289;
+ setp.ne.s32 %p92, %r329, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd158, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11];
cvta.to.global.u64 %rd122, %rd158;
- mov.u32 %r331, %ctaid.x;
- mov.u32 %r332, %ctaid.z;
- mov.u32 %r333, %nctaid.x;
- mad.lo.s32 %r334, %r332, %r333, %r331;
- mul.wide.s32 %rd123, %r334, 8;
+ mov.u32 %r330, %ctaid.x;
+ mov.u32 %r331, %ctaid.z;
+ mov.u32 %r332, %nctaid.x;
+ mad.lo.s32 %r333, %r331, %r332, %r330;
+ mul.wide.s32 %rd123, %r333, 8;
add.s64 %rd29, %rd122, %rd123;
- add.s32 %r335, %r8, -1;
- setp.eq.s32 %p93, %r74, %r335;
- cvt.s64.s32 %rd124, %r8;
+ add.s32 %r334, %r9, -1;
+ setp.eq.s32 %p93, %r74, %r334;
+ cvt.s64.s32 %rd124, %r9;
mov.u64 %rd125, -9223372036854775807;
sub.s64 %rd126, %rd125, %rd124;
selp.b64 %rd127, %rd126, 1, %p93;
atom.global.add.u64 %rd30, [%rd29], %rd127;
ld.volatile.global.u64 %rd128, [%rd29];
xor.b64 %rd129, %rd128, %rd30;
setp.lt.s64 %p94, %rd129, 0;
@%p94 bra $L__BB0_143;
- mov.u32 %r452, 8;
+ mov.u32 %r450, 8;
$L__BB0_142:
- nanosleep.u32 %r452;
-
- setp.lt.u32 %p95, %r452, 256;
- selp.u32 %r338, 1, 0, %p95;
- shl.b32 %r452, %r452, %r338;
+ nanosleep.u32 %r450;
+
+ setp.lt.u32 %p95, %r450, 256;
+ selp.u32 %r337, 1, 0, %p95;
+ shl.b32 %r450, %r450, %r337;
ld.volatile.global.u64 %rd130, [%rd29];
xor.b64 %rd131, %rd130, %rd30;
setp.gt.s64 %p96, %rd131, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- add.s32 %r339, %r8, %r2;
- add.s32 %r340, %r339, -1;
- div.s32 %r77, %r340, %r2;
+ add.s32 %r338, %r9, %r3;
+ add.s32 %r339, %r338, -1;
+ div.s32 %r77, %r339, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f384, 0f00000000;
mov.f32 %f385, %f384;
@%p97 bra $L__BB0_149;
- add.s32 %r342, %r177, 1;
- shr.u32 %r343, %r342, 31;
- add.s32 %r344, %r342, %r343;
- shr.s32 %r345, %r344, 1;
- add.s32 %r346, %r3, %r345;
- add.s32 %r347, %r346, -1;
- shl.b32 %r348, %r7, 1;
- shl.b32 %r349, %r3, 1;
- mad.lo.s32 %r350, %r349, %r74, %r348;
- or.b32 %r351, %r350, 1;
- setp.ge.s32 %p98, %r351, %r177;
- div.s32 %r352, %r347, %r3;
- setp.ge.s32 %p99, %r74, %r352;
+ add.s32 %r341, %r177, 1;
+ shr.u32 %r342, %r341, 31;
+ add.s32 %r343, %r341, %r342;
+ shr.s32 %r344, %r343, 1;
+ add.s32 %r345, %r4, %r344;
+ add.s32 %r346, %r345, -1;
+ shl.b32 %r347, %r8, 1;
+ shl.b32 %r348, %r4, 1;
+ mad.lo.s32 %r349, %r348, %r74, %r347;
+ or.b32 %r350, %r349, 1;
+ setp.ge.s32 %p98, %r350, %r177;
+ div.s32 %r351, %r346, %r4;
+ setp.ge.s32 %p99, %r74, %r351;
or.pred %p6, %p99, %p98;
- mul.lo.s32 %r353, %r3, %r74;
- shl.b32 %r354, %r353, 1;
- mad.lo.s32 %r355, %r177, %r5, %r354;
- add.s32 %r454, %r355, %r348;
- mul.lo.s32 %r79, %r177, %r2;
- mov.u32 %r341, 0;
+ mul.lo.s32 %r352, %r4, %r74;
+ shl.b32 %r353, %r352, 1;
+ mad.lo.s32 %r354, %r177, %r6, %r353;
+ add.s32 %r452, %r354, %r347;
+ mul.lo.s32 %r79, %r177, %r3;
+ mov.u32 %r340, 0;
mov.f32 %f384, 0f00000000;
- mov.u32 %r453, %r5;
- mov.u32 %r455, %r341;
+ mov.u32 %r451, %r6;
+ mov.u32 %r453, %r340;
$L__BB0_145:
.pragma "nounroll";
- mov.u32 %r456, %r341;
- mov.u32 %r457, %r341;
+ mov.u32 %r454, %r340;
+ mov.u32 %r455, %r340;
@%p6 bra $L__BB0_148;
- setp.ge.s32 %p100, %r453, %r8;
- mov.u32 %r456, %r341;
- mov.u32 %r457, %r341;
+ setp.ge.s32 %p100, %r451, %r9;
+ mov.u32 %r454, %r340;
+ mov.u32 %r455, %r340;
@%p100 bra $L__BB0_148;
- mul.wide.s32 %rd133, %r454, 4;
+ mul.wide.s32 %rd133, %r452, 4;
add.s64 %rd132, %rd43, %rd133;
- ld.volatile.global.v2.s32 {%r457,%r456}, [%rd132];
+ ld.volatile.global.v2.s32 {%r455,%r454}, [%rd132];
$L__BB0_148:
- mov.b32 %f308, %r457;
+ mov.b32 %f308, %r455;
add.f32 %f384, %f384, %f308;
- mov.b32 %f309, %r456;
+ mov.b32 %f309, %r454;
add.f32 %f385, %f385, %f309;
- add.s32 %r454, %r454, %r79;
- add.s32 %r453, %r453, %r2;
- add.s32 %r455, %r455, 1;
- setp.lt.s32 %p101, %r455, %r77;
+ add.s32 %r452, %r452, %r79;
+ add.s32 %r451, %r451, %r3;
+ add.s32 %r453, %r453, 1;
+ setp.lt.s32 %p101, %r453, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
- clz.b32 %r362, %r2;
- mov.u32 %r363, 31;
- sub.s32 %r364, %r363, %r362;
- mov.u32 %r365, 1;
- shl.b32 %r90, %r365, %r364;
- setp.lt.u32 %p102, %r5, %r90;
- add.s32 %r366, %r90, %r5;
- setp.lt.u32 %p103, %r366, %r2;
+ clz.b32 %r361, %r3;
+ mov.u32 %r362, 31;
+ sub.s32 %r363, %r362, %r361;
+ mov.u32 %r364, 1;
+ shl.b32 %r90, %r364, %r363;
+ setp.lt.u32 %p102, %r6, %r90;
+ add.s32 %r365, %r90, %r6;
+ setp.lt.u32 %p103, %r365, %r3;
and.pred %p7, %p102, %p103;
- add.s32 %r367, %r39, %r90;
- mul.wide.s32 %rd134, %r367, 4;
+ add.s32 %r366, %r39, %r90;
+ mul.wide.s32 %rd134, %r366, 4;
add.s64 %rd31, %rd45, %rd134;
- shr.u32 %r368, %r90, 31;
- add.s32 %r369, %r90, %r368;
- shr.s32 %r469, %r369, 1;
+ shr.u32 %r367, %r90, 31;
+ add.s32 %r368, %r90, %r367;
+ shr.s32 %r467, %r368, 1;
st.shared.f32 [%rd24], %f384;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
@@ -1132,49 +1130,49 @@
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
- mov.u32 %r458, %r469;
+ mov.u32 %r456, %r467;
$L__BB0_153:
- setp.ge.u32 %p106, %r5, %r458;
+ setp.ge.u32 %p106, %r6, %r456;
@%p106 bra $L__BB0_155;
- add.s32 %r370, %r458, %r39;
- mul.wide.s32 %rd136, %r370, 4;
+ add.s32 %r369, %r456, %r39;
+ mul.wide.s32 %rd136, %r369, 4;
add.s64 %rd138, %rd45, %rd136;
ld.shared.f32 %f313, [%rd24];
ld.shared.f32 %f314, [%rd138];
add.f32 %f315, %f314, %f313;
st.shared.f32 [%rd24], %f315;
$L__BB0_155:
bar.sync 0;
- shr.u32 %r93, %r458, 1;
- setp.gt.u32 %p107, %r458, 3;
- mov.u32 %r458, %r93;
+ shr.u32 %r93, %r456, 1;
+ setp.gt.u32 %p107, %r456, 3;
+ mov.u32 %r456, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
- add.s32 %r372, %r39, 1;
- mul.wide.u32 %rd139, %r372, 4;
+ add.s32 %r371, %r39, 1;
+ mul.wide.u32 %rd139, %r371, 4;
add.s64 %rd32, %rd45, %rd139;
- setp.ne.s32 %p108, %r5, 0;
- mov.u32 %r459, 0;
+ setp.ne.s32 %p108, %r6, 0;
+ mov.u32 %r457, 0;
@%p108 bra $L__BB0_160;
- setp.lt.u32 %p109, %r2, 2;
+ setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f316, [%rd24];
add.f32 %f386, %f316, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f317, [%rd32];
add.f32 %f386, %f386, %f317;
$L__BB0_159:
- mov.b32 %r459, %f386;
+ mov.b32 %r457, %f386;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd24], %f385;
bar.sync 0;
@@ -1187,131 +1185,131 @@
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
- mov.u32 %r460, %r469;
+ mov.u32 %r458, %r467;
$L__BB0_164:
- setp.ge.u32 %p112, %r5, %r460;
+ setp.ge.u32 %p112, %r6, %r458;
@%p112 bra $L__BB0_166;
- add.s32 %r373, %r460, %r39;
- mul.wide.s32 %rd141, %r373, 4;
+ add.s32 %r372, %r458, %r39;
+ mul.wide.s32 %rd141, %r372, 4;
add.s64 %rd143, %rd45, %rd141;
ld.shared.f32 %f321, [%rd24];
ld.shared.f32 %f322, [%rd143];
add.f32 %f323, %f322, %f321;
st.shared.f32 [%rd24], %f323;
$L__BB0_166:
bar.sync 0;
- shr.u32 %r97, %r460, 1;
- setp.gt.u32 %p113, %r460, 3;
- mov.u32 %r460, %r97;
+ shr.u32 %r97, %r458, 1;
+ setp.gt.u32 %p113, %r458, 3;
+ mov.u32 %r458, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
- mov.u32 %r461, 0;
+ mov.u32 %r459, 0;
@%p108 bra $L__BB0_171;
- setp.lt.u32 %p115, %r2, 2;
+ setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f324, [%rd24];
add.f32 %f387, %f324, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f325, [%rd32];
add.f32 %f387, %f387, %f325;
$L__BB0_170:
- mov.b32 %r461, %f387;
+ mov.b32 %r459, %f387;
$L__BB0_171:
bar.sync 0;
@%p108 bra $L__BB0_175;
- add.s32 %r375, %r177, 1;
- shr.u32 %r376, %r375, 31;
- add.s32 %r377, %r375, %r376;
- shr.s32 %r378, %r377, 1;
- add.s32 %r379, %r3, %r378;
- add.s32 %r380, %r379, -1;
- div.s32 %r381, %r380, %r3;
- setp.ge.s32 %p117, %r74, %r381;
+ add.s32 %r374, %r177, 1;
+ shr.u32 %r375, %r374, 31;
+ add.s32 %r376, %r374, %r375;
+ shr.s32 %r377, %r376, 1;
+ add.s32 %r378, %r4, %r377;
+ add.s32 %r379, %r378, -1;
+ div.s32 %r380, %r379, %r4;
+ setp.ge.s32 %p117, %r74, %r380;
@%p117 bra $L__BB0_175;
- shl.b32 %r100, %r7, 1;
- mul.lo.s32 %r382, %r3, %r74;
- shl.b32 %r101, %r382, 1;
- add.s32 %r383, %r100, %r101;
- or.b32 %r384, %r383, 1;
- setp.ge.s32 %p118, %r384, %r177;
+ shl.b32 %r100, %r8, 1;
+ mul.lo.s32 %r381, %r4, %r74;
+ shl.b32 %r101, %r381, 1;
+ add.s32 %r382, %r100, %r101;
+ or.b32 %r383, %r382, 1;
+ setp.ge.s32 %p118, %r383, %r177;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd157, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6];
- add.s32 %r387, %r101, %r100;
- mul.wide.s32 %rd145, %r387, 4;
+ add.s32 %r386, %r101, %r100;
+ mul.wide.s32 %rd145, %r386, 4;
add.s64 %rd144, %rd157, %rd145;
- st.global.cs.v2.s32 [%rd144], {%r459,%r461};
+ st.global.cs.v2.s32 [%rd144], {%r457,%r459};
$L__BB0_175:
mov.f32 %f390, 0f00000000;
mov.f32 %f391, %f390;
@%p97 bra $L__BB0_181;
- add.s32 %r389, %r177, 1;
- shr.u32 %r390, %r389, 31;
- add.s32 %r391, %r389, %r390;
- shr.s32 %r392, %r391, 1;
- add.s32 %r393, %r3, %r392;
- add.s32 %r394, %r393, -1;
- shl.b32 %r395, %r7, 1;
- shl.b32 %r396, %r3, 1;
- mad.lo.s32 %r397, %r396, %r74, %r395;
- or.b32 %r398, %r397, 1;
- setp.ge.s32 %p120, %r398, %r177;
- div.s32 %r399, %r394, %r3;
- setp.ge.s32 %p121, %r74, %r399;
+ add.s32 %r388, %r177, 1;
+ shr.u32 %r389, %r388, 31;
+ add.s32 %r390, %r388, %r389;
+ shr.s32 %r391, %r390, 1;
+ add.s32 %r392, %r4, %r391;
+ add.s32 %r393, %r392, -1;
+ shl.b32 %r394, %r8, 1;
+ shl.b32 %r395, %r4, 1;
+ mad.lo.s32 %r396, %r395, %r74, %r394;
+ or.b32 %r397, %r396, 1;
+ setp.ge.s32 %p120, %r397, %r177;
+ div.s32 %r398, %r393, %r4;
+ setp.ge.s32 %p121, %r74, %r398;
or.pred %p8, %p121, %p120;
- mul.lo.s32 %r400, %r3, %r74;
- shl.b32 %r401, %r400, 1;
- mad.lo.s32 %r402, %r177, %r5, %r401;
- add.s32 %r463, %r402, %r395;
- mul.lo.s32 %r103, %r177, %r2;
- mov.u32 %r388, 0;
+ mul.lo.s32 %r399, %r4, %r74;
+ shl.b32 %r400, %r399, 1;
+ mad.lo.s32 %r401, %r177, %r6, %r400;
+ add.s32 %r461, %r401, %r394;
+ mul.lo.s32 %r103, %r177, %r3;
+ mov.u32 %r387, 0;
mov.f32 %f390, 0f00000000;
- mov.u32 %r462, %r5;
- mov.u32 %r464, %r388;
+ mov.u32 %r460, %r6;
+ mov.u32 %r462, %r387;
$L__BB0_177:
.pragma "nounroll";
- mov.u32 %r465, %r388;
- mov.u32 %r466, %r388;
+ mov.u32 %r463, %r387;
+ mov.u32 %r464, %r387;
@%p8 bra $L__BB0_180;
- setp.ge.s32 %p122, %r462, %r8;
- mov.u32 %r465, %r388;
- mov.u32 %r466, %r388;
+ setp.ge.s32 %p122, %r460, %r9;
+ mov.u32 %r463, %r387;
+ mov.u32 %r464, %r387;
@%p122 bra $L__BB0_180;
- mul.wide.s32 %rd147, %r463, 4;
+ mul.wide.s32 %rd147, %r461, 4;
add.s64 %rd146, %rd42, %rd147;
- ld.volatile.global.v2.s32 {%r466,%r465}, [%rd146];
+ ld.volatile.global.v2.s32 {%r464,%r463}, [%rd146];
$L__BB0_180:
- mov.b32 %f330, %r466;
+ mov.b32 %f330, %r464;
add.f32 %f390, %f390, %f330;
- mov.b32 %f331, %r465;
+ mov.b32 %f331, %r463;
add.f32 %f391, %f391, %f331;
- add.s32 %r463, %r463, %r103;
- add.s32 %r462, %r462, %r2;
- add.s32 %r464, %r464, 1;
- setp.lt.s32 %p123, %r464, %r77;
+ add.s32 %r461, %r461, %r103;
+ add.s32 %r460, %r460, %r3;
+ add.s32 %r462, %r462, 1;
+ setp.lt.s32 %p123, %r462, %r77;
@%p123 bra $L__BB0_177;
$L__BB0_181:
st.shared.f32 [%rd24], %f390;
bar.sync 0;
@@ -1324,45 +1322,45 @@
$L__BB0_183:
bar.sync 0;
@%p105 bra $L__BB0_188;
- mov.u32 %r467, %r469;
+ mov.u32 %r465, %r467;
$L__BB0_185:
- setp.ge.u32 %p126, %r5, %r467;
+ setp.ge.u32 %p126, %r6, %r465;
@%p126 bra $L__BB0_187;
- add.s32 %r409, %r467, %r39;
- mul.wide.s32 %rd148, %r409, 4;
+ add.s32 %r408, %r465, %r39;
+ mul.wide.s32 %rd148, %r408, 4;
add.s64 %rd150, %rd45, %rd148;
ld.shared.f32 %f335, [%rd24];
ld.shared.f32 %f336, [%rd150];
add.f32 %f337, %f336, %f335;
st.shared.f32 [%rd24], %f337;
$L__BB0_187:
bar.sync 0;
- shr.u32 %r115, %r467, 1;
- setp.gt.u32 %p127, %r467, 3;
- mov.u32 %r467, %r115;
+ shr.u32 %r115, %r465, 1;
+ setp.gt.u32 %p127, %r465, 3;
+ mov.u32 %r465, %r115;
@%p127 bra $L__BB0_185;
$L__BB0_188:
- mov.u32 %r468, 0;
+ mov.u32 %r466, 0;
@%p108 bra $L__BB0_192;
- setp.lt.u32 %p129, %r2, 2;
+ setp.lt.u32 %p129, %r3, 2;
ld.shared.f32 %f338, [%rd24];
add.f32 %f392, %f338, 0f00000000;
@%p129 bra $L__BB0_191;
ld.shared.f32 %f339, [%rd32];
add.f32 %f392, %f392, %f339;
$L__BB0_191:
- mov.b32 %r468, %f392;
+ mov.b32 %r466, %f392;
$L__BB0_192:
bar.sync 0;
st.shared.f32 [%rd24], %f391;
bar.sync 0;
@@ -1376,71 +1374,71 @@
$L__BB0_194:
bar.sync 0;
@%p105 bra $L__BB0_198;
$L__BB0_195:
- setp.ge.u32 %p132, %r5, %r469;
+ setp.ge.u32 %p132, %r6, %r467;
@%p132 bra $L__BB0_197;
- add.s32 %r411, %r469, %r39;
- mul.wide.s32 %rd151, %r411, 4;
+ add.s32 %r410, %r467, %r39;
+ mul.wide.s32 %rd151, %r410, 4;
add.s64 %rd153, %rd45, %rd151;
ld.shared.f32 %f343, [%rd24];
ld.shared.f32 %f344, [%rd153];
add.f32 %f345, %f344, %f343;
st.shared.f32 [%rd24], %f345;
$L__BB0_197:
bar.sync 0;
- shr.u32 %r119, %r469, 1;
- setp.gt.u32 %p133, %r469, 3;
- mov.u32 %r469, %r119;
+ shr.u32 %r119, %r467, 1;
+ setp.gt.u32 %p133, %r467, 3;
+ mov.u32 %r467, %r119;
@%p133 bra $L__BB0_195;
$L__BB0_198:
- mov.u32 %r470, 0;
+ mov.u32 %r468, 0;
@%p108 bra $L__BB0_202;
- setp.lt.u32 %p135, %r2, 2;
+ setp.lt.u32 %p135, %r3, 2;
ld.shared.f32 %f346, [%rd24];
add.f32 %f393, %f346, 0f00000000;
@%p135 bra $L__BB0_201;
ld.shared.f32 %f347, [%rd32];
add.f32 %f393, %f393, %f347;
$L__BB0_201:
- mov.b32 %r470, %f393;
+ mov.b32 %r468, %f393;
$L__BB0_202:
bar.sync 0;
@%p108 bra $L__BB0_206;
- add.s32 %r413, %r177, 1;
- shr.u32 %r414, %r413, 31;
- add.s32 %r415, %r413, %r414;
- shr.s32 %r416, %r415, 1;
- add.s32 %r417, %r3, %r416;
- add.s32 %r418, %r417, -1;
- div.s32 %r419, %r418, %r3;
- setp.ge.s32 %p137, %r74, %r419;
+ add.s32 %r412, %r177, 1;
+ shr.u32 %r413, %r412, 31;
+ add.s32 %r414, %r412, %r413;
+ shr.s32 %r415, %r414, 1;
+ add.s32 %r416, %r4, %r415;
+ add.s32 %r417, %r416, -1;
+ div.s32 %r418, %r417, %r4;
+ setp.ge.s32 %p137, %r74, %r418;
@%p137 bra $L__BB0_206;
- shl.b32 %r122, %r7, 1;
- mul.lo.s32 %r420, %r3, %r74;
- shl.b32 %r123, %r420, 1;
- add.s32 %r421, %r122, %r123;
- or.b32 %r422, %r421, 1;
- setp.ge.s32 %p138, %r422, %r177;
+ shl.b32 %r122, %r8, 1;
+ mul.lo.s32 %r419, %r4, %r74;
+ shl.b32 %r123, %r419, 1;
+ add.s32 %r420, %r122, %r123;
+ or.b32 %r421, %r420, 1;
+ setp.ge.s32 %p138, %r421, %r177;
@%p138 bra $L__BB0_206;
ld.param.u64 %rd156, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S1_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5];
- add.s32 %r425, %r123, %r122;
- mul.wide.s32 %rd155, %r425, 4;
+ add.s32 %r424, %r123, %r122;
+ mul.wide.s32 %rd155, %r424, 4;
add.s64 %rd154, %rd156, %rd155;
- st.global.cs.v2.s32 [%rd154], {%r468,%r470};
+ st.global.cs.v2.s32 [%rd154], {%r466,%r468};
$L__BB0_206:
ret;
Kernel 4
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 1, 1> T23, Tensor<float, 1, 1> T41, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T55, Tensor<float, 2, 2> T60, Tensor<int64_t, 1, 1> T65) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T37 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T55
// Allocate global tensor T60
__syncthreads();
Array<float, 4, 4> T56;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T56[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T61;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T61[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T54;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T59;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T59[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T36[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T53;
T53[0] = 0.000000000e+00f;
Array<float, 1, 1> T64;
T64[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
T59[i9]
= T59[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
T53[0]
= T53[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T64[0]
= T64[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T52;
T52.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T59[i9]
= T59[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T53[0]
= T53[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T64[0]
= T64[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T53[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T64[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T56[i6], T54[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T61[i7], T59[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T55[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T56[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T60[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T61[0]);
}
}
// Allocate global tensor T65
grid_sync::sync<false, true, false, true, true>(T65[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T58;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T58[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T57;
T57.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T57[0], &*(volatile float*)&T55[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T58[i12]
= T58[i12]
+ T57[i12];
}
}
Array<float, 2, 2> T40;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T40[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T40[i14], T58[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T40[0]);
}
Array<float, 2, 2> T39;
T39.set(float(0));
if (((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 2, 1> T63;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T63[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T62;
T62.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T62[0], &*(volatile float*)&T60[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T63[i15]
= T63[i15]
+ T62[i15];
}
}
Array<float, 2, 2> T42;
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
Array<float, 1, 1> T22;
T22[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T22[0], T63[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T42[i17]
= T22[0];
T44[i17]
= T22[0]
+ T39[i17];
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T41[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T42[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 1, 1> T23, Tensor<float, 1, 1> T41, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T55, Tensor<float, 2, 2> T60, Tensor<int64_t, 1, 1> T65) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T37 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T55
// Allocate global tensor T60
__syncthreads();
Array<float, 4, 4> T56;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T56[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T61;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T61[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T54;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T59;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T59[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T36[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T53;
T53[0] = 0.000000000e+00f;
Array<float, 1, 1> T64;
T64[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
T59[i9]
= T59[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
T53[0]
= T53[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T64[0]
= T64[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T52;
T52.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T59[i9]
= T59[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T53[0]
= T53[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T64[0]
= T64[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T53[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T64[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T56[i6], T54[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T61[i7], T59[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T55[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T56[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T60[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T61[0]);
}
}
// Allocate global tensor T65
grid_sync::sync<false, true, false, true, true>(T65[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T58;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T58[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T57;
T57.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T57[0], &*(volatile float*)&T55[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T58[i12]
= T58[i12]
+ T57[i12];
}
}
Array<float, 2, 2> T40;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T40[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T40[i14], T58[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T40[0]);
}
Array<float, 2, 2> T39;
T39.set(float(0));
if (((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 2, 1> T63;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T63[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T62;
T62.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T62[0], &*(volatile float*)&T60[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T63[i15]
= T63[i15]
+ T62[i15];
}
}
Array<float, 2, 2> T42;
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
Array<float, 1, 1> T22;
T22[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T22[0], T63[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T42[i17]
= T22[0];
T44[i17]
= T22[0]
+ T39[i17];
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T41[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T42[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -152,17 +152,17 @@
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
- loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
- loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
@@ -203,21 +203,21 @@
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
- loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
- loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<406>;
.reg .b32 %r<462>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r173, %r174}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r175, %r176}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
shr.s32 %r207, %r206, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r208, %r207, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r209, %r3, 2;
mad.lo.s32 %r210, %r209, %r208, 15;
and.b32 %r211, %r210, -16;
cvt.u64.u32 %rd1, %r211;
mul.lo.s32 %r212, %r3, %r207;
shl.b32 %r213, %r212, 4;
or.b32 %r214, %r213, 15;
and.b32 %r4, %r214, -16;
add.s32 %r215, %r214, %r4;
and.b32 %r216, %r215, -16;
cvt.s64.s32 %rd2, %r216;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p9, %r5, %r207;
shl.b32 %r6, %r5, 2;
or.b32 %r217, %r6, 3;
setp.lt.s32 %p10, %r217, %r176;
and.pred %p1, %p10, %p9;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p11, %r7, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
// end inline asm
shl.b32 %r221, %r5, 4;
add.s32 %r219, %r218, %r221;
mul.wide.s32 %rd49, %r6, 4;
add.s64 %rd48, %rd36, %rd49;
mov.u32 %r220, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r220, 0;
cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r222, %r3, 63;
div.s32 %r223, %r222, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r224, %r8, %r223;
add.s32 %r225, %r224, -1;
div.s32 %r9, %r225, %r8;
setp.gt.s32 %p13, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
cvt.s64.s32 %rd50, %r4;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r227, %ctaid.y;
mul.lo.s32 %r228, %r9, %r3;
mul.lo.s32 %r10, %r228, %r227;
shl.b32 %r229, %r7, 2;
shl.b32 %r230, %r5, 4;
mad.lo.s32 %r11, %r229, %r176, %r230;
mul.lo.s32 %r231, %r176, %r7;
cvt.s64.s32 %rd54, %r231;
cvt.s64.s32 %rd55, %r6;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r232, %r10, %r176;
cvt.s64.s32 %rd6, %r232;
mul.lo.s32 %r12, %r176, %r3;
mul.lo.s32 %r13, %r9, %r227;
add.s32 %r14, %r231, %r6;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r14, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
mad.lo.s32 %r234, %r3, %r233, %r7;
mad.lo.s32 %r15, %r234, %r2, %r5;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r235, %r2;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
setp.lt.u32 %p14, %r5, %r16;
add.s32 %r239, %r16, %r5;
setp.lt.u32 %p15, %r239, %r2;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r6, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r234, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r422, 0;
mov.f32 %f368, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
// end inline asm
add.s32 %r247, %r11, %r246;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
// end inline asm
add.s32 %r273, %r11, %r272;
not.pred %p26, %p3;
mov.f32 %f369, %f368;
mov.f32 %f370, %f368;
mov.f32 %f371, %f368;
mov.f32 %f380, %f368;
mov.f32 %f381, %f368;
mov.f32 %f382, %f368;
mov.f32 %f383, %f368;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r244, %r422, %r3, %r7;
add.s32 %r245, %r244, %r10;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r249, %r12, %r422;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
mov.u32 %r248, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r248, 0;
cp.async.ca.shared.global [%r247], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r250, %r13, %r422;
mad.lo.s32 %r251, %r250, %r3, %r7;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r260, %r13, %r422;
mad.lo.s32 %r261, %r260, %r3, %r7;
setp.gt.s32 %p20, %r261, 63;
mov.u32 %r423, 0;
mov.u32 %r424, %r423;
mov.u32 %r425, %r423;
mov.u32 %r426, %r423;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r423, 0;
mov.u32 %r424, %r423;
mov.u32 %r425, %r423;
mov.u32 %r426, %r423;
$L__BB0_15:
add.s32 %r270, %r13, %r422;
mad.lo.s32 %r33, %r270, %r3, %r7;
mov.b32 %f117, %r426;
add.f32 %f383, %f383, %f117;
mov.b32 %f118, %r425;
add.f32 %f382, %f382, %f118;
mov.b32 %f119, %r424;
add.f32 %f381, %f381, %f119;
mov.b32 %f120, %r423;
add.f32 %f380, %f380, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f366, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r271, %r33, %r169;
mul.wide.s32 %rd71, %r271, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f366, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r275, %r12, %r422;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
mov.u32 %r274, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r274, 0;
cp.async.ca.shared.global [%r273], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r421, %r13, %r422;
mad.lo.s32 %r420, %r421, %r3, %r7;
setp.gt.s32 %p145, %r420, 63;
mov.f32 %f372, 0f00000000;
mov.f32 %f367, %f372;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
mul.wide.s32 %rd79, %r276, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f367, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f373, %f372;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd12];
sub.f32 %f129, %f124, %f366;
mul.f32 %f130, %f367, %f129;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd7];
fma.rn.f32 %f368, %f130, %f131, %f368;
ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%rd10];
mul.f32 %f141, %f136, %f131;
add.f32 %f142, %f141, 0f00000000;
fma.rn.f32 %f143, %f130, %f141, 0f00000000;
sub.f32 %f145, %f125, %f366;
mul.f32 %f146, %f367, %f145;
fma.rn.f32 %f369, %f146, %f132, %f369;
mul.f32 %f149, %f137, %f132;
add.f32 %f150, %f142, %f149;
fma.rn.f32 %f151, %f146, %f149, %f143;
sub.f32 %f153, %f126, %f366;
mul.f32 %f154, %f367, %f153;
fma.rn.f32 %f370, %f154, %f133, %f370;
mul.f32 %f157, %f138, %f133;
add.f32 %f158, %f150, %f157;
fma.rn.f32 %f159, %f154, %f157, %f151;
sub.f32 %f161, %f127, %f366;
mul.f32 %f162, %f367, %f161;
fma.rn.f32 %f371, %f162, %f134, %f371;
mul.f32 %f165, %f139, %f134;
add.f32 %f373, %f158, %f165;
fma.rn.f32 %f372, %f162, %f165, %f159;
$L__BB0_23:
st.shared.f32 [%rd8], %f373;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f166, [%rd9];
ld.shared.f32 %f167, [%rd8];
add.f32 %f168, %f166, %f167;
st.shared.f32 [%rd8], %f168;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r427, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r5, %r427;
@%p28 bra $L__BB0_29;
add.s32 %r277, %r427, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r427, 1;
setp.gt.u32 %p29, %r427, 3;
mov.u32 %r427, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r5, 0;
mov.f32 %f374, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r2, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f374, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
add.f32 %f374, %f374, %f174;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f372;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f175, [%rd9];
ld.shared.f32 %f176, [%rd8];
add.f32 %f177, %f175, %f176;
st.shared.f32 [%rd8], %f177;
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
mov.u32 %r428, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r5, %r428;
@%p34 bra $L__BB0_39;
add.s32 %r278, %r428, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r428, 1;
setp.gt.u32 %p35, %r428, 3;
mov.u32 %r428, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f375, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r2, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f375, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
add.f32 %f375, %f375, %f183;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f374;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f375;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f184, %f367, %f1;
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd10];
ld.shared.v4.f32 {%f190, %f191, %f192, %f193}, [%rd7];
mul.f32 %f195, %f185, %f190;
mul.f32 %f196, %f195, %f2;
ld.shared.v4.f32 {%f197, %f198, %f199, %f200}, [%rd12];
sub.f32 %f202, %f197, %f366;
mul.f32 %f203, %f367, %f202;
sub.f32 %f204, %f196, %f37;
mul.f32 %f205, %f38, %f203;
sub.f32 %f206, %f204, %f205;
mul.f32 %f207, %f184, %f206;
mov.b32 %r279, %f207;
mul.f32 %f210, %f186, %f191;
mul.f32 %f211, %f210, %f2;
sub.f32 %f213, %f198, %f366;
mul.f32 %f214, %f367, %f213;
sub.f32 %f215, %f211, %f37;
mul.f32 %f216, %f38, %f214;
sub.f32 %f217, %f215, %f216;
mul.f32 %f218, %f184, %f217;
mov.b32 %r280, %f218;
mul.f32 %f221, %f187, %f192;
mul.f32 %f222, %f221, %f2;
sub.f32 %f224, %f199, %f366;
mul.f32 %f225, %f367, %f224;
sub.f32 %f226, %f222, %f37;
mul.f32 %f227, %f38, %f225;
sub.f32 %f228, %f226, %f227;
mul.f32 %f229, %f184, %f228;
mov.b32 %r281, %f229;
mul.f32 %f232, %f188, %f193;
mul.f32 %f233, %f232, %f2;
sub.f32 %f235, %f200, %f366;
mul.f32 %f236, %f367, %f235;
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
mad.lo.s32 %r283, %r422, %r3, %r10;
mad.lo.s32 %r284, %r283, %r176, %r14;
mul.wide.s32 %rd88, %r284, 4;
add.s64 %rd87, %rd40, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_49:
add.s32 %r422, %r422, 1;
setp.lt.s32 %p41, %r422, %r9;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f368, 0f00000000;
mov.f32 %f369, %f368;
mov.f32 %f370, %f368;
mov.f32 %f371, %f368;
mov.f32 %f380, %f368;
mov.f32 %f381, %f368;
mov.f32 %f382, %f368;
mov.f32 %f383, %f368;
$L__BB0_50:
mov.u32 %r285, %tid.z;
mad.lo.s32 %r286, %r3, %r285, %r7;
mad.lo.s32 %r39, %r286, %r2, %r5;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r287, %r3;
mov.u32 %r288, 31;
sub.s32 %r289, %r288, %r287;
mov.u32 %r290, 1;
shl.b32 %r40, %r290, %r289;
setp.lt.u32 %p42, %r7, %r40;
add.s32 %r291, %r40, %r7;
setp.lt.u32 %p43, %r291, %r3;
and.pred %p5, %p42, %p43;
shl.b32 %r292, %r2, %r289;
add.s32 %r293, %r39, %r292;
mul.wide.s32 %rd91, %r293, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r294, %r40, 31;
add.s32 %r295, %r40, %r294;
shr.s32 %r443, %r295, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f241, [%rd24];
ld.shared.f32 %f242, [%rd23];
add.f32 %f243, %f241, %f242;
st.shared.f32 [%rd23], %f243;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r429, %r443;
$L__BB0_54:
setp.ge.u32 %p46, %r7, %r429;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r296, %r429, %r2, %r39;
mul.wide.s32 %rd92, %r296, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r429, 1;
setp.gt.u32 %p47, %r429, 3;
mov.u32 %r429, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r298, %r39, %r2;
mul.wide.u32 %rd95, %r298, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p48, %r7, 0;
mov.u32 %r430, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r3, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f384, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f384, %f384, %f248;
$L__BB0_60:
mov.b32 %r430, %f384;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f249, [%rd24];
ld.shared.f32 %f250, [%rd23];
add.f32 %f251, %f249, %f250;
st.shared.f32 [%rd23], %f251;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r431, %r443;
$L__BB0_65:
setp.ge.u32 %p52, %r7, %r431;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r299, %r431, %r2, %r39;
mul.wide.s32 %rd97, %r299, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r431, 1;
setp.gt.u32 %p53, %r431, 3;
mov.u32 %r431, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r432, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r3, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f385, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f385, %f385, %f256;
$L__BB0_71:
mov.b32 %r432, %f385;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f257, [%rd24];
ld.shared.f32 %f258, [%rd23];
add.f32 %f259, %f257, %f258;
st.shared.f32 [%rd23], %f259;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r433, %r443;
$L__BB0_76:
setp.ge.u32 %p58, %r7, %r433;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r301, %r433, %r2, %r39;
mul.wide.s32 %rd100, %r301, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r433, 1;
setp.gt.u32 %p59, %r433, 3;
mov.u32 %r433, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r434, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r3, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f386, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f386, %f386, %f264;
$L__BB0_82:
mov.b32 %r434, %f386;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f265, [%rd24];
ld.shared.f32 %f266, [%rd23];
add.f32 %f267, %f265, %f266;
st.shared.f32 [%rd23], %f267;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r435, %r443;
$L__BB0_87:
setp.ge.u32 %p64, %r7, %r435;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r303, %r435, %r2, %r39;
mul.wide.s32 %rd103, %r303, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r435, 1;
setp.gt.u32 %p65, %r435, 3;
mov.u32 %r435, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r436, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r3, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f387, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f387, %f387, %f272;
$L__BB0_93:
mov.b32 %r436, %f387;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f273, [%rd24];
ld.shared.f32 %f274, [%rd23];
add.f32 %f275, %f273, %f274;
st.shared.f32 [%rd23], %f275;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r437, %r443;
$L__BB0_98:
setp.ge.u32 %p70, %r7, %r437;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r305, %r437, %r2, %r39;
mul.wide.s32 %rd106, %r305, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r437, 1;
setp.gt.u32 %p71, %r437, 3;
mov.u32 %r437, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r438, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r3, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f388, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f388, %f388, %f280;
$L__BB0_104:
mov.b32 %r438, %f388;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f281, [%rd24];
ld.shared.f32 %f282, [%rd23];
add.f32 %f283, %f281, %f282;
st.shared.f32 [%rd23], %f283;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r439, %r443;
$L__BB0_109:
setp.ge.u32 %p76, %r7, %r439;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r307, %r439, %r2, %r39;
mul.wide.s32 %rd109, %r307, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r439, 1;
setp.gt.u32 %p77, %r439, 3;
mov.u32 %r439, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r440, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r3, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f389, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f389, %f389, %f288;
$L__BB0_115:
mov.b32 %r440, %f389;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f289, [%rd24];
ld.shared.f32 %f290, [%rd23];
add.f32 %f291, %f289, %f290;
st.shared.f32 [%rd23], %f291;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r441, %r443;
$L__BB0_120:
setp.ge.u32 %p82, %r7, %r441;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r309, %r441, %r2, %r39;
mul.wide.s32 %rd112, %r309, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r441, 1;
setp.gt.u32 %p83, %r441, 3;
mov.u32 %r441, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r442, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f390, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f390, %f390, %f296;
$L__BB0_126:
mov.b32 %r442, %f390;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f297, [%rd24];
ld.shared.f32 %f298, [%rd23];
add.f32 %f299, %f297, %f298;
st.shared.f32 [%rd23], %f299;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r7, %r443;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r311, %r443, %r2, %r39;
mul.wide.s32 %rd115, %r311, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r443, 1;
setp.gt.u32 %p89, %r443, 3;
mov.u32 %r443, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r444, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f391, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f391, %f391, %f304;
$L__BB0_136:
mov.b32 %r444, %f391;
$L__BB0_137:
setp.eq.s32 %p144, %r7, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r419, %r5, 2;
mov.u32 %r321, %ctaid.y;
mad.lo.s32 %r322, %r176, %r321, %r419;
mul.wide.s32 %rd120, %r322, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r430,%r432,%r434,%r436};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r438,%r440,%r442,%r444};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r323, %r5, %r7;
or.b32 %r325, %r323, %r285;
setp.ne.s32 %p92, %r325, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
mov.u32 %r326, %ctaid.x;
mov.u32 %r327, %ctaid.z;
mov.u32 %r328, %nctaid.x;
mad.lo.s32 %r329, %r327, %r328, %r326;
mul.wide.s32 %rd122, %r329, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r330, %r8, -1;
setp.eq.s32 %p93, %r74, %r330;
cvt.s64.s32 %rd123, %r8;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r445, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r445;
// end inline asm
setp.lt.u32 %p95, %r445, 256;
selp.u32 %r333, 1, 0, %p95;
shl.b32 %r445, %r445, %r333;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r334, %r8, %r2;
add.s32 %r335, %r334, -1;
div.s32 %r77, %r335, %r2;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f394, 0f00000000;
mov.f32 %f395, %f394;
@%p97 bra $L__BB0_149;
add.s32 %r337, %r176, 1;
shr.u32 %r338, %r337, 31;
add.s32 %r339, %r337, %r338;
shr.s32 %r340, %r339, 1;
add.s32 %r341, %r3, %r340;
add.s32 %r342, %r341, -1;
shl.b32 %r343, %r7, 1;
shl.b32 %r344, %r3, 1;
mad.lo.s32 %r345, %r344, %r74, %r343;
or.b32 %r346, %r345, 1;
setp.ge.s32 %p98, %r346, %r176;
div.s32 %r347, %r342, %r3;
setp.ge.s32 %p99, %r74, %r347;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r348, %r3, %r74;
shl.b32 %r349, %r348, 1;
mad.lo.s32 %r350, %r176, %r5, %r349;
add.s32 %r447, %r350, %r343;
mul.lo.s32 %r79, %r176, %r2;
mov.u32 %r336, 0;
mov.f32 %f394, 0f00000000;
mov.u32 %r446, %r5;
mov.u32 %r448, %r336;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r449, %r336;
mov.u32 %r450, %r336;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r446, %r8;
mov.u32 %r449, %r336;
mov.u32 %r450, %r336;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd132, %r447, 4;
add.s64 %rd131, %rd42, %rd132;
// begin inline asm
ld.volatile.global.v2.s32 {%r450,%r449}, [%rd131];
// end inline asm
$L__BB0_148:
mov.b32 %f309, %r450;
add.f32 %f394, %f394, %f309;
mov.b32 %f310, %r449;
add.f32 %f395, %f395, %f310;
add.s32 %r447, %r447, %r79;
add.s32 %r446, %r446, %r2;
add.s32 %r448, %r448, 1;
setp.lt.s32 %p101, %r448, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r357, %r2;
mov.u32 %r358, 31;
sub.s32 %r359, %r358, %r357;
mov.u32 %r360, 1;
shl.b32 %r90, %r360, %r359;
setp.lt.u32 %p102, %r5, %r90;
add.s32 %r361, %r90, %r5;
setp.lt.u32 %p103, %r361, %r2;
and.pred %p7, %p102, %p103;
add.s32 %r362, %r39, %r90;
mul.wide.s32 %rd133, %r362, 4;
add.s64 %rd30, %rd45, %rd133;
shr.u32 %r363, %r90, 31;
add.s32 %r364, %r90, %r363;
shr.s32 %r461, %r364, 1;
st.shared.f32 [%rd23], %f394;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f311, [%rd30];
ld.shared.f32 %f312, [%rd23];
add.f32 %f313, %f311, %f312;
st.shared.f32 [%rd23], %f313;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r451, %r461;
$L__BB0_153:
setp.ge.u32 %p106, %r5, %r451;
@%p106 bra $L__BB0_155;
add.s32 %r365, %r451, %r39;
mul.wide.s32 %rd135, %r365, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r451, 1;
setp.gt.u32 %p107, %r451, 3;
mov.u32 %r451, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r367, %r39, 1;
mul.wide.u32 %rd138, %r367, 4;
add.s64 %rd31, %rd45, %rd138;
setp.ne.s32 %p108, %r5, 0;
mov.u32 %r452, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r2, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f396, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f396, %f396, %f318;
$L__BB0_159:
mov.b32 %r452, %f396;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f395;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f319, [%rd30];
ld.shared.f32 %f320, [%rd23];
add.f32 %f321, %f319, %f320;
st.shared.f32 [%rd23], %f321;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r453, %r461;
$L__BB0_164:
setp.ge.u32 %p112, %r5, %r453;
@%p112 bra $L__BB0_166;
add.s32 %r368, %r453, %r39;
mul.wide.s32 %rd140, %r368, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r453, 1;
setp.gt.u32 %p113, %r453, 3;
mov.u32 %r453, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r454, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r2, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f397, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f397, %f397, %f326;
$L__BB0_170:
mov.b32 %r454, %f397;
$L__BB0_171:
bar.sync 0;
setp.eq.s32 %p116, %r5, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
add.s32 %r370, %r176, 1;
shr.u32 %r371, %r370, 31;
add.s32 %r372, %r370, %r371;
shr.s32 %r373, %r372, 1;
add.s32 %r374, %r3, %r373;
add.s32 %r375, %r374, -1;
div.s32 %r376, %r375, %r3;
setp.ge.s32 %p117, %r74, %r376;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r7, 1;
mul.lo.s32 %r377, %r3, %r74;
shl.b32 %r101, %r377, 1;
add.s32 %r378, %r100, %r101;
or.b32 %r379, %r378, 1;
setp.ge.s32 %p118, %r379, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r382, %r101, %r100;
mul.wide.s32 %rd144, %r382, 4;
add.s64 %rd143, %rd160, %rd144;
// begin inline asm
st.global.cs.v2.s32 [%rd143], {%r452,%r454};
// end inline asm
$L__BB0_175:
add.s32 %r383, %r176, 1;
shr.u32 %r384, %r383, 31;
add.s32 %r385, %r383, %r384;
shr.s32 %r386, %r385, 1;
add.s32 %r387, %r3, %r386;
add.s32 %r388, %r387, -1;
div.s32 %r102, %r388, %r3;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f398, 0f00000000;
mov.f32 %f402, 0f00000000;
mov.f32 %f399, %f402;
@%p119 bra $L__BB0_178;
shl.b32 %r103, %r7, 1;
mul.lo.s32 %r389, %r3, %r74;
shl.b32 %r104, %r389, 1;
add.s32 %r390, %r103, %r104;
or.b32 %r391, %r390, 1;
setp.ge.s32 %p120, %r391, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r394, %r104, %r103;
mul.wide.s32 %rd146, %r394, 4;
add.s64 %rd145, %rd159, %rd146;
// begin inline asm
ld.global.cs.v2.u32 {%r392,%r393}, [%rd145];
// end inline asm
mov.b32 %f398, %r392;
mov.b32 %f399, %r393;
$L__BB0_178:
mov.f32 %f403, %f402;
@%p97 bra $L__BB0_184;
shl.b32 %r396, %r7, 1;
shl.b32 %r397, %r3, 1;
mad.lo.s32 %r398, %r397, %r74, %r396;
or.b32 %r399, %r398, 1;
setp.ge.s32 %p122, %r399, %r176;
or.pred %p8, %p122, %p119;
mul.lo.s32 %r400, %r3, %r74;
shl.b32 %r401, %r400, 1;
mad.lo.s32 %r402, %r176, %r5, %r401;
add.s32 %r456, %r402, %r396;
mul.lo.s32 %r106, %r176, %r2;
mov.u32 %r395, 0;
mov.f32 %f402, 0f00000000;
mov.u32 %r455, %r5;
mov.f32 %f403, %f402;
mov.u32 %r457, %r395;
$L__BB0_180:
.pragma "nounroll";
mov.u32 %r458, %r395;
mov.u32 %r459, %r395;
@%p8 bra $L__BB0_183;
setp.ge.s32 %p124, %r455, %r8;
mov.u32 %r458, %r395;
mov.u32 %r459, %r395;
@%p124 bra $L__BB0_183;
mul.wide.s32 %rd148, %r456, 4;
add.s64 %rd147, %rd43, %rd148;
// begin inline asm
ld.volatile.global.v2.s32 {%r459,%r458}, [%rd147];
// end inline asm
$L__BB0_183:
mov.b32 %f335, %r459;
add.f32 %f402, %f402, %f335;
mov.b32 %f336, %r458;
add.f32 %f403, %f403, %f336;
add.s32 %r456, %r456, %r106;
add.s32 %r455, %r455, %r2;
add.s32 %r457, %r457, 1;
setp.lt.s32 %p125, %r457, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f402;
bar.sync 0;
@%p104 bra $L__BB0_186;
ld.shared.f32 %f337, [%rd30];
ld.shared.f32 %f338, [%rd23];
add.f32 %f339, %f337, %f338;
st.shared.f32 [%rd23], %f339;
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
mov.u32 %r460, %r461;
$L__BB0_188:
setp.ge.u32 %p128, %r5, %r460;
@%p128 bra $L__BB0_190;
add.s32 %r409, %r460, %r39;
mul.wide.s32 %rd149, %r409, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
shr.u32 %r118, %r460, 1;
setp.gt.u32 %p129, %r460, 3;
mov.u32 %r460, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f404, 0f00000000;
@%p108 bra $L__BB0_194;
setp.lt.u32 %p131, %r2, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f404, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
add.f32 %f404, %f404, %f345;
$L__BB0_194:
bar.sync 0;
st.shared.f32 [%rd23], %f403;
bar.sync 0;
@%p104 bra $L__BB0_196;
ld.shared.f32 %f346, [%rd30];
ld.shared.f32 %f347, [%rd23];
add.f32 %f348, %f346, %f347;
st.shared.f32 [%rd23], %f348;
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
setp.ge.u32 %p134, %r5, %r461;
@%p134 bra $L__BB0_199;
add.s32 %r410, %r461, %r39;
mul.wide.s32 %rd152, %r410, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
shr.u32 %r120, %r461, 1;
setp.gt.u32 %p135, %r461, 3;
mov.u32 %r461, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f96, %f404, %f398;
mov.f32 %f405, 0f00000000;
@%p108 bra $L__BB0_203;
setp.lt.u32 %p137, %r2, 2;
ld.shared.f32 %f353, [%rd23];
add.f32 %f405, %f353, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f354, [%rd31];
add.f32 %f405, %f405, %f354;
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
shl.b32 %r121, %r7, 1;
mul.lo.s32 %r411, %r3, %r74;
shl.b32 %r122, %r411, 1;
add.s32 %r412, %r121, %r122;
or.b32 %r413, %r412, 1;
setp.ge.s32 %p141, %r413, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_0ab5d412_1033910nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r418, %r122, %r121;
mul.wide.s32 %rd157, %r418, 4;
add.s64 %rd155, %rd158, %rd157;
mov.b32 %r415, %f405;
mov.b32 %r414, %f404;
// begin inline asm
st.global.cs.v2.s32 [%rd155], {%r414,%r415};
// end inline asm
add.s64 %rd156, %rd162, %rd157;
add.f32 %f355, %f405, %f399;
mov.b32 %r417, %f355;
mov.b32 %r416, %f96;
// begin inline asm
st.global.cs.v2.s32 [%rd156], {%r416,%r417};
// end inline asm
$L__BB0_206:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<406>;
.reg .b32 %r<460>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r173, %r174}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r175, %r176}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
shr.s32 %r2, %r206, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r207, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r208, %r4, 2;
mad.lo.s32 %r209, %r208, %r207, 15;
and.b32 %r210, %r209, -16;
cvt.u64.u32 %rd1, %r210;
mul.lo.s32 %r211, %r4, %r2;
shl.b32 %r212, %r211, 4;
or.b32 %r213, %r212, 15;
and.b32 %r5, %r213, -16;
add.s32 %r214, %r213, %r5;
and.b32 %r215, %r214, -16;
cvt.s64.s32 %rd2, %r215;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r216, %r7, 3;
setp.lt.s32 %p10, %r216, %r176;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r217, smem_ptr; }
// end inline asm
shl.b32 %r220, %r6, 4;
add.s32 %r218, %r217, %r220;
mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd36, %rd49;
mov.u32 %r219, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r219, 0;
cp.async.ca.shared.global [%r218], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r221, %r4, 63;
div.s32 %r222, %r221, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r223, %r9, %r222;
add.s32 %r224, %r223, -1;
div.s32 %r10, %r224, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r226, %ctaid.y;
mul.lo.s32 %r227, %r10, %r4;
mul.lo.s32 %r11, %r227, %r226;
mad.lo.s32 %r228, %r2, %r8, %r6;
shl.b32 %r12, %r228, 4;
mul.lo.s32 %r229, %r176, %r8;
cvt.s64.s32 %rd54, %r229;
cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r230, %r11, %r176;
cvt.s64.s32 %rd6, %r230;
mul.lo.s32 %r13, %r176, %r4;
mul.lo.s32 %r14, %r10, %r226;
shl.b32 %r231, %r8, 2;
mad.lo.s32 %r232, %r231, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r232, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
mad.lo.s32 %r234, %r4, %r233, %r8;
mad.lo.s32 %r15, %r234, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r235, %r3;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r239, %r16, %r6;
setp.lt.u32 %p15, %r239, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r234, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r420, 0;
mov.f32 %f368, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
// end inline asm
add.s32 %r247, %r246, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
// end inline asm
add.s32 %r273, %r272, %r12;
not.pred %p26, %p3;
mov.f32 %f369, %f368;
mov.f32 %f370, %f368;
mov.f32 %f371, %f368;
mov.f32 %f380, %f368;
mov.f32 %f381, %f368;
mov.f32 %f382, %f368;
mov.f32 %f383, %f368;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r244, %r420, %r4, %r8;
add.s32 %r245, %r244, %r11;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r249, %r13, %r420;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
mov.u32 %r248, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r248, 0;
cp.async.ca.shared.global [%r247], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r250, %r14, %r420;
mad.lo.s32 %r251, %r250, %r4, %r8;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r260, %r14, %r420;
mad.lo.s32 %r261, %r260, %r4, %r8;
setp.gt.s32 %p20, %r261, 63;
mov.u32 %r421, 0;
mov.u32 %r422, %r421;
mov.u32 %r423, %r421;
mov.u32 %r424, %r421;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r421, 0;
mov.u32 %r422, %r421;
mov.u32 %r423, %r421;
mov.u32 %r424, %r421;
$L__BB0_15:
add.s32 %r270, %r14, %r420;
mad.lo.s32 %r33, %r270, %r4, %r8;
mov.b32 %f117, %r424;
add.f32 %f383, %f383, %f117;
mov.b32 %f118, %r423;
add.f32 %f382, %f382, %f118;
mov.b32 %f119, %r422;
add.f32 %f381, %f381, %f119;
mov.b32 %f120, %r421;
add.f32 %f380, %f380, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f366, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r271, %r33, %r169;
mul.wide.s32 %rd71, %r271, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f366, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r275, %r13, %r420;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
mov.u32 %r274, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r274, 0;
cp.async.ca.shared.global [%r273], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r419, %r14, %r420;
mad.lo.s32 %r418, %r419, %r4, %r8;
setp.gt.s32 %p145, %r418, 63;
mov.f32 %f372, 0f00000000;
mov.f32 %f367, %f372;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
mul.wide.s32 %rd79, %r276, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f367, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f373, %f372;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd12];
sub.f32 %f129, %f124, %f366;
mul.f32 %f130, %f367, %f129;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd7];
fma.rn.f32 %f368, %f130, %f131, %f368;
ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%rd10];
mul.f32 %f141, %f136, %f131;
add.f32 %f142, %f141, 0f00000000;
fma.rn.f32 %f143, %f130, %f141, 0f00000000;
sub.f32 %f145, %f125, %f366;
mul.f32 %f146, %f367, %f145;
fma.rn.f32 %f369, %f146, %f132, %f369;
mul.f32 %f149, %f137, %f132;
add.f32 %f150, %f142, %f149;
fma.rn.f32 %f151, %f146, %f149, %f143;
sub.f32 %f153, %f126, %f366;
mul.f32 %f154, %f367, %f153;
fma.rn.f32 %f370, %f154, %f133, %f370;
mul.f32 %f157, %f138, %f133;
add.f32 %f158, %f150, %f157;
fma.rn.f32 %f159, %f154, %f157, %f151;
sub.f32 %f161, %f127, %f366;
mul.f32 %f162, %f367, %f161;
fma.rn.f32 %f371, %f162, %f134, %f371;
mul.f32 %f165, %f139, %f134;
add.f32 %f373, %f158, %f165;
fma.rn.f32 %f372, %f162, %f165, %f159;
$L__BB0_23:
st.shared.f32 [%rd8], %f373;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f166, [%rd9];
ld.shared.f32 %f167, [%rd8];
add.f32 %f168, %f166, %f167;
st.shared.f32 [%rd8], %f168;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r425, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r6, %r425;
@%p28 bra $L__BB0_29;
add.s32 %r277, %r425, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r425, 1;
setp.gt.u32 %p29, %r425, 3;
mov.u32 %r425, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r6, 0;
mov.f32 %f374, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f374, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
add.f32 %f374, %f374, %f174;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f372;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f175, [%rd9];
ld.shared.f32 %f176, [%rd8];
add.f32 %f177, %f175, %f176;
st.shared.f32 [%rd8], %f177;
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
mov.u32 %r426, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r6, %r426;
@%p34 bra $L__BB0_39;
add.s32 %r278, %r426, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r426, 1;
setp.gt.u32 %p35, %r426, 3;
mov.u32 %r426, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f375, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f375, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
add.f32 %f375, %f375, %f183;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f374;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f375;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f184, %f367, %f1;
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd10];
ld.shared.v4.f32 {%f190, %f191, %f192, %f193}, [%rd7];
mul.f32 %f195, %f185, %f190;
mul.f32 %f196, %f195, %f2;
ld.shared.v4.f32 {%f197, %f198, %f199, %f200}, [%rd12];
sub.f32 %f202, %f197, %f366;
mul.f32 %f203, %f367, %f202;
sub.f32 %f204, %f196, %f37;
mul.f32 %f205, %f38, %f203;
sub.f32 %f206, %f204, %f205;
mul.f32 %f207, %f184, %f206;
mov.b32 %r279, %f207;
mul.f32 %f210, %f186, %f191;
mul.f32 %f211, %f210, %f2;
sub.f32 %f213, %f198, %f366;
mul.f32 %f214, %f367, %f213;
sub.f32 %f215, %f211, %f37;
mul.f32 %f216, %f38, %f214;
sub.f32 %f217, %f215, %f216;
mul.f32 %f218, %f184, %f217;
mov.b32 %r280, %f218;
mul.f32 %f221, %f187, %f192;
mul.f32 %f222, %f221, %f2;
sub.f32 %f224, %f199, %f366;
mul.f32 %f225, %f367, %f224;
sub.f32 %f226, %f222, %f37;
mul.f32 %f227, %f38, %f225;
sub.f32 %f228, %f226, %f227;
mul.f32 %f229, %f184, %f228;
mov.b32 %r281, %f229;
mul.f32 %f232, %f188, %f193;
mul.f32 %f233, %f232, %f2;
sub.f32 %f235, %f200, %f366;
mul.f32 %f236, %f367, %f235;
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
mad.lo.s32 %r283, %r33, %r176, %r7;
mul.wide.s32 %rd88, %r283, 4;
add.s64 %rd87, %rd40, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_49:
add.s32 %r420, %r420, 1;
setp.lt.s32 %p41, %r420, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f368, 0f00000000;
mov.f32 %f369, %f368;
mov.f32 %f370, %f368;
mov.f32 %f371, %f368;
mov.f32 %f380, %f368;
mov.f32 %f381, %f368;
mov.f32 %f382, %f368;
mov.f32 %f383, %f368;
$L__BB0_50:
mov.u32 %r284, %tid.z;
mad.lo.s32 %r285, %r4, %r284, %r8;
mad.lo.s32 %r39, %r285, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r286, %r4;
mov.u32 %r287, 31;
sub.s32 %r288, %r287, %r286;
mov.u32 %r289, 1;
shl.b32 %r40, %r289, %r288;
setp.lt.u32 %p42, %r8, %r40;
add.s32 %r290, %r40, %r8;
setp.lt.u32 %p43, %r290, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r291, %r3, %r288;
add.s32 %r292, %r39, %r291;
mul.wide.s32 %rd91, %r292, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r293, %r40, 31;
add.s32 %r294, %r40, %r293;
shr.s32 %r441, %r294, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f241, [%rd24];
ld.shared.f32 %f242, [%rd23];
add.f32 %f243, %f241, %f242;
st.shared.f32 [%rd23], %f243;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r427, %r441;
$L__BB0_54:
setp.ge.u32 %p46, %r8, %r427;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r295, %r427, %r3, %r39;
mul.wide.s32 %rd92, %r295, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r427, 1;
setp.gt.u32 %p47, %r427, 3;
mov.u32 %r427, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r297, %r39, %r3;
mul.wide.u32 %rd95, %r297, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.u32 %r428, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f384, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f384, %f384, %f248;
$L__BB0_60:
mov.b32 %r428, %f384;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f249, [%rd24];
ld.shared.f32 %f250, [%rd23];
add.f32 %f251, %f249, %f250;
st.shared.f32 [%rd23], %f251;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r429, %r441;
$L__BB0_65:
setp.ge.u32 %p52, %r8, %r429;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r298, %r429, %r3, %r39;
mul.wide.s32 %rd97, %r298, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r429, 1;
setp.gt.u32 %p53, %r429, 3;
mov.u32 %r429, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r430, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f385, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f385, %f385, %f256;
$L__BB0_71:
mov.b32 %r430, %f385;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f257, [%rd24];
ld.shared.f32 %f258, [%rd23];
add.f32 %f259, %f257, %f258;
st.shared.f32 [%rd23], %f259;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r431, %r441;
$L__BB0_76:
setp.ge.u32 %p58, %r8, %r431;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r300, %r431, %r3, %r39;
mul.wide.s32 %rd100, %r300, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r431, 1;
setp.gt.u32 %p59, %r431, 3;
mov.u32 %r431, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r432, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f386, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f386, %f386, %f264;
$L__BB0_82:
mov.b32 %r432, %f386;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f265, [%rd24];
ld.shared.f32 %f266, [%rd23];
add.f32 %f267, %f265, %f266;
st.shared.f32 [%rd23], %f267;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r433, %r441;
$L__BB0_87:
setp.ge.u32 %p64, %r8, %r433;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r302, %r433, %r3, %r39;
mul.wide.s32 %rd103, %r302, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r433, 1;
setp.gt.u32 %p65, %r433, 3;
mov.u32 %r433, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r434, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f387, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f387, %f387, %f272;
$L__BB0_93:
mov.b32 %r434, %f387;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f273, [%rd24];
ld.shared.f32 %f274, [%rd23];
add.f32 %f275, %f273, %f274;
st.shared.f32 [%rd23], %f275;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r435, %r441;
$L__BB0_98:
setp.ge.u32 %p70, %r8, %r435;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r304, %r435, %r3, %r39;
mul.wide.s32 %rd106, %r304, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r435, 1;
setp.gt.u32 %p71, %r435, 3;
mov.u32 %r435, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r436, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f388, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f388, %f388, %f280;
$L__BB0_104:
mov.b32 %r436, %f388;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f281, [%rd24];
ld.shared.f32 %f282, [%rd23];
add.f32 %f283, %f281, %f282;
st.shared.f32 [%rd23], %f283;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r437, %r441;
$L__BB0_109:
setp.ge.u32 %p76, %r8, %r437;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r306, %r437, %r3, %r39;
mul.wide.s32 %rd109, %r306, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r437, 1;
setp.gt.u32 %p77, %r437, 3;
mov.u32 %r437, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r438, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f389, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f389, %f389, %f288;
$L__BB0_115:
mov.b32 %r438, %f389;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f289, [%rd24];
ld.shared.f32 %f290, [%rd23];
add.f32 %f291, %f289, %f290;
st.shared.f32 [%rd23], %f291;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r439, %r441;
$L__BB0_120:
setp.ge.u32 %p82, %r8, %r439;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r308, %r439, %r3, %r39;
mul.wide.s32 %rd112, %r308, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r439, 1;
setp.gt.u32 %p83, %r439, 3;
mov.u32 %r439, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r440, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f390, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f390, %f390, %f296;
$L__BB0_126:
mov.b32 %r440, %f390;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f297, [%rd24];
ld.shared.f32 %f298, [%rd23];
add.f32 %f299, %f297, %f298;
st.shared.f32 [%rd23], %f299;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r8, %r441;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r310, %r441, %r3, %r39;
mul.wide.s32 %rd115, %r310, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r441, 1;
setp.gt.u32 %p89, %r441, 3;
mov.u32 %r441, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r442, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f391, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f391, %f391, %f304;
$L__BB0_136:
mov.b32 %r442, %f391;
$L__BB0_137:
setp.eq.s32 %p144, %r8, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r320, %ctaid.y;
mad.lo.s32 %r321, %r176, %r320, %r7;
mul.wide.s32 %rd120, %r321, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r428,%r430,%r432,%r434};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r436,%r438,%r440,%r442};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r322, %r6, %r8;
or.b32 %r324, %r322, %r284;
setp.ne.s32 %p92, %r324, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
mov.u32 %r325, %ctaid.x;
mov.u32 %r326, %ctaid.z;
mov.u32 %r327, %nctaid.x;
mad.lo.s32 %r328, %r326, %r327, %r325;
mul.wide.s32 %rd122, %r328, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r329, %r9, -1;
setp.eq.s32 %p93, %r74, %r329;
cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r443, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r443;
// end inline asm
setp.lt.u32 %p95, %r443, 256;
selp.u32 %r332, 1, 0, %p95;
shl.b32 %r443, %r443, %r332;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r333, %r9, %r3;
add.s32 %r334, %r333, -1;
div.s32 %r77, %r334, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f394, 0f00000000;
mov.f32 %f395, %f394;
@%p97 bra $L__BB0_149;
add.s32 %r336, %r176, 1;
shr.u32 %r337, %r336, 31;
add.s32 %r338, %r336, %r337;
shr.s32 %r339, %r338, 1;
add.s32 %r340, %r4, %r339;
add.s32 %r341, %r340, -1;
shl.b32 %r342, %r8, 1;
shl.b32 %r343, %r4, 1;
mad.lo.s32 %r344, %r343, %r74, %r342;
or.b32 %r345, %r344, 1;
setp.ge.s32 %p98, %r345, %r176;
div.s32 %r346, %r341, %r4;
setp.ge.s32 %p99, %r74, %r346;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r347, %r4, %r74;
shl.b32 %r348, %r347, 1;
mad.lo.s32 %r349, %r176, %r6, %r348;
add.s32 %r445, %r349, %r342;
mul.lo.s32 %r79, %r176, %r3;
mov.u32 %r335, 0;
mov.f32 %f394, 0f00000000;
mov.u32 %r444, %r6;
mov.u32 %r446, %r335;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r447, %r335;
mov.u32 %r448, %r335;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r444, %r9;
mov.u32 %r447, %r335;
mov.u32 %r448, %r335;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd132, %r445, 4;
add.s64 %rd131, %rd42, %rd132;
// begin inline asm
ld.volatile.global.v2.s32 {%r448,%r447}, [%rd131];
// end inline asm
$L__BB0_148:
mov.b32 %f309, %r448;
add.f32 %f394, %f394, %f309;
mov.b32 %f310, %r447;
add.f32 %f395, %f395, %f310;
add.s32 %r445, %r445, %r79;
add.s32 %r444, %r444, %r3;
add.s32 %r446, %r446, 1;
setp.lt.s32 %p101, %r446, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r356, %r3;
mov.u32 %r357, 31;
sub.s32 %r358, %r357, %r356;
mov.u32 %r359, 1;
shl.b32 %r90, %r359, %r358;
setp.lt.u32 %p102, %r6, %r90;
add.s32 %r360, %r90, %r6;
setp.lt.u32 %p103, %r360, %r3;
and.pred %p7, %p102, %p103;
add.s32 %r361, %r39, %r90;
mul.wide.s32 %rd133, %r361, 4;
add.s64 %rd30, %rd45, %rd133;
shr.u32 %r362, %r90, 31;
add.s32 %r363, %r90, %r362;
shr.s32 %r459, %r363, 1;
st.shared.f32 [%rd23], %f394;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f311, [%rd30];
ld.shared.f32 %f312, [%rd23];
add.f32 %f313, %f311, %f312;
st.shared.f32 [%rd23], %f313;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r449, %r459;
$L__BB0_153:
setp.ge.u32 %p106, %r6, %r449;
@%p106 bra $L__BB0_155;
add.s32 %r364, %r449, %r39;
mul.wide.s32 %rd135, %r364, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r449, 1;
setp.gt.u32 %p107, %r449, 3;
mov.u32 %r449, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r366, %r39, 1;
mul.wide.u32 %rd138, %r366, 4;
add.s64 %rd31, %rd45, %rd138;
setp.ne.s32 %p108, %r6, 0;
mov.u32 %r450, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f396, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f396, %f396, %f318;
$L__BB0_159:
mov.b32 %r450, %f396;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f395;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f319, [%rd30];
ld.shared.f32 %f320, [%rd23];
add.f32 %f321, %f319, %f320;
st.shared.f32 [%rd23], %f321;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r451, %r459;
$L__BB0_164:
setp.ge.u32 %p112, %r6, %r451;
@%p112 bra $L__BB0_166;
add.s32 %r367, %r451, %r39;
mul.wide.s32 %rd140, %r367, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r451, 1;
setp.gt.u32 %p113, %r451, 3;
mov.u32 %r451, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r452, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f397, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f397, %f397, %f326;
$L__BB0_170:
mov.b32 %r452, %f397;
$L__BB0_171:
bar.sync 0;
setp.eq.s32 %p116, %r6, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
add.s32 %r369, %r176, 1;
shr.u32 %r370, %r369, 31;
add.s32 %r371, %r369, %r370;
shr.s32 %r372, %r371, 1;
add.s32 %r373, %r4, %r372;
add.s32 %r374, %r373, -1;
div.s32 %r375, %r374, %r4;
setp.ge.s32 %p117, %r74, %r375;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r8, 1;
mul.lo.s32 %r376, %r4, %r74;
shl.b32 %r101, %r376, 1;
add.s32 %r377, %r100, %r101;
or.b32 %r378, %r377, 1;
setp.ge.s32 %p118, %r378, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r381, %r101, %r100;
mul.wide.s32 %rd144, %r381, 4;
add.s64 %rd143, %rd160, %rd144;
// begin inline asm
st.global.cs.v2.s32 [%rd143], {%r450,%r452};
// end inline asm
$L__BB0_175:
add.s32 %r382, %r176, 1;
shr.u32 %r383, %r382, 31;
add.s32 %r384, %r382, %r383;
shr.s32 %r385, %r384, 1;
add.s32 %r386, %r4, %r385;
add.s32 %r387, %r386, -1;
div.s32 %r102, %r387, %r4;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f398, 0f00000000;
mov.f32 %f402, 0f00000000;
mov.f32 %f399, %f402;
@%p119 bra $L__BB0_178;
shl.b32 %r103, %r8, 1;
mul.lo.s32 %r388, %r4, %r74;
shl.b32 %r104, %r388, 1;
add.s32 %r389, %r103, %r104;
or.b32 %r390, %r389, 1;
setp.ge.s32 %p120, %r390, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r393, %r104, %r103;
mul.wide.s32 %rd146, %r393, 4;
add.s64 %rd145, %rd159, %rd146;
// begin inline asm
ld.global.cs.v2.u32 {%r391,%r392}, [%rd145];
// end inline asm
mov.b32 %f398, %r391;
mov.b32 %f399, %r392;
$L__BB0_178:
mov.f32 %f403, %f402;
@%p97 bra $L__BB0_184;
shl.b32 %r395, %r8, 1;
shl.b32 %r396, %r4, 1;
mad.lo.s32 %r397, %r396, %r74, %r395;
or.b32 %r398, %r397, 1;
setp.ge.s32 %p122, %r398, %r176;
or.pred %p8, %p122, %p119;
mul.lo.s32 %r399, %r4, %r74;
shl.b32 %r400, %r399, 1;
mad.lo.s32 %r401, %r176, %r6, %r400;
add.s32 %r454, %r401, %r395;
mul.lo.s32 %r106, %r176, %r3;
mov.u32 %r394, 0;
mov.f32 %f402, 0f00000000;
mov.u32 %r453, %r6;
mov.f32 %f403, %f402;
mov.u32 %r455, %r394;
$L__BB0_180:
.pragma "nounroll";
mov.u32 %r456, %r394;
mov.u32 %r457, %r394;
@%p8 bra $L__BB0_183;
setp.ge.s32 %p124, %r453, %r9;
mov.u32 %r456, %r394;
mov.u32 %r457, %r394;
@%p124 bra $L__BB0_183;
mul.wide.s32 %rd148, %r454, 4;
add.s64 %rd147, %rd43, %rd148;
// begin inline asm
ld.volatile.global.v2.s32 {%r457,%r456}, [%rd147];
// end inline asm
$L__BB0_183:
mov.b32 %f335, %r457;
add.f32 %f402, %f402, %f335;
mov.b32 %f336, %r456;
add.f32 %f403, %f403, %f336;
add.s32 %r454, %r454, %r106;
add.s32 %r453, %r453, %r3;
add.s32 %r455, %r455, 1;
setp.lt.s32 %p125, %r455, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f402;
bar.sync 0;
@%p104 bra $L__BB0_186;
ld.shared.f32 %f337, [%rd30];
ld.shared.f32 %f338, [%rd23];
add.f32 %f339, %f337, %f338;
st.shared.f32 [%rd23], %f339;
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
mov.u32 %r458, %r459;
$L__BB0_188:
setp.ge.u32 %p128, %r6, %r458;
@%p128 bra $L__BB0_190;
add.s32 %r408, %r458, %r39;
mul.wide.s32 %rd149, %r408, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
shr.u32 %r118, %r458, 1;
setp.gt.u32 %p129, %r458, 3;
mov.u32 %r458, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f404, 0f00000000;
@%p108 bra $L__BB0_194;
setp.lt.u32 %p131, %r3, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f404, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
add.f32 %f404, %f404, %f345;
$L__BB0_194:
bar.sync 0;
st.shared.f32 [%rd23], %f403;
bar.sync 0;
@%p104 bra $L__BB0_196;
ld.shared.f32 %f346, [%rd30];
ld.shared.f32 %f347, [%rd23];
add.f32 %f348, %f346, %f347;
st.shared.f32 [%rd23], %f348;
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
setp.ge.u32 %p134, %r6, %r459;
@%p134 bra $L__BB0_199;
add.s32 %r409, %r459, %r39;
mul.wide.s32 %rd152, %r409, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
shr.u32 %r120, %r459, 1;
setp.gt.u32 %p135, %r459, 3;
mov.u32 %r459, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f96, %f404, %f398;
mov.f32 %f405, 0f00000000;
@%p108 bra $L__BB0_203;
setp.lt.u32 %p137, %r3, 2;
ld.shared.f32 %f353, [%rd23];
add.f32 %f405, %f353, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f354, [%rd31];
add.f32 %f405, %f405, %f354;
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
shl.b32 %r121, %r8, 1;
mul.lo.s32 %r410, %r4, %r74;
shl.b32 %r122, %r410, 1;
add.s32 %r411, %r121, %r122;
or.b32 %r412, %r411, 1;
setp.ge.s32 %p141, %r412, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_60_cu_f233a6ae_723310nvfuser_60ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r417, %r122, %r121;
mul.wide.s32 %rd157, %r417, 4;
add.s64 %rd155, %rd158, %rd157;
mov.b32 %r414, %f405;
mov.b32 %r413, %f404;
// begin inline asm
st.global.cs.v2.s32 [%rd155], {%r413,%r414};
// end inline asm
add.s64 %rd156, %rd162, %rd157;
add.f32 %f355, %f405, %f399;
mov.b32 %r416, %f355;
mov.b32 %r415, %f96;
// begin inline asm
st.global.cs.v2.s32 [%rd156], {%r415,%r416};
// end inline asm
$L__BB0_206:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -31,11 +31,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<406>;
- .reg .b32 %r<462>;
+ .reg .b32 %r<460>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
@@ -51,119 +51,119 @@
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
- shr.s32 %r207, %r206, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r208, %r207, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r209, %r3, 2;
- mad.lo.s32 %r210, %r209, %r208, 15;
- and.b32 %r211, %r210, -16;
- cvt.u64.u32 %rd1, %r211;
- mul.lo.s32 %r212, %r3, %r207;
- shl.b32 %r213, %r212, 4;
- or.b32 %r214, %r213, 15;
- and.b32 %r4, %r214, -16;
- add.s32 %r215, %r214, %r4;
- and.b32 %r216, %r215, -16;
- cvt.s64.s32 %rd2, %r216;
+ shr.s32 %r2, %r206, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r207, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r208, %r4, 2;
+ mad.lo.s32 %r209, %r208, %r207, 15;
+ and.b32 %r210, %r209, -16;
+ cvt.u64.u32 %rd1, %r210;
+ mul.lo.s32 %r211, %r4, %r2;
+ shl.b32 %r212, %r211, 4;
+ or.b32 %r213, %r212, 15;
+ and.b32 %r5, %r213, -16;
+ add.s32 %r214, %r213, %r5;
+ and.b32 %r215, %r214, -16;
+ cvt.s64.s32 %rd2, %r215;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p9, %r5, %r207;
- shl.b32 %r6, %r5, 2;
- or.b32 %r217, %r6, 3;
- setp.lt.s32 %p10, %r217, %r176;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p9, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r216, %r7, 3;
+ setp.lt.s32 %p10, %r216, %r176;
and.pred %p1, %p10, %p9;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p11, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
-
-
- shl.b32 %r221, %r5, 4;
- add.s32 %r219, %r218, %r221;
- mul.wide.s32 %rd49, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r217, smem_ptr; }
+
+
+ shl.b32 %r220, %r6, 4;
+ add.s32 %r218, %r217, %r220;
+ mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd36, %rd49;
- mov.u32 %r220, 0;
+ mov.u32 %r219, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r220, 0;
- cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r219, 0;
+ cp.async.ca.shared.global [%r218], [%rd48], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r222, %r3, 63;
- div.s32 %r223, %r222, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r224, %r8, %r223;
- add.s32 %r225, %r224, -1;
- div.s32 %r9, %r225, %r8;
- setp.gt.s32 %p13, %r9, 0;
+ add.s32 %r221, %r4, 63;
+ div.s32 %r222, %r221, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r223, %r9, %r222;
+ add.s32 %r224, %r223, -1;
+ div.s32 %r10, %r224, %r9;
+ setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
- cvt.s64.s32 %rd50, %r4;
+ cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
- mov.u32 %r227, %ctaid.y;
- mul.lo.s32 %r228, %r9, %r3;
- mul.lo.s32 %r10, %r228, %r227;
- shl.b32 %r229, %r7, 2;
- shl.b32 %r230, %r5, 4;
- mad.lo.s32 %r11, %r229, %r176, %r230;
- mul.lo.s32 %r231, %r176, %r7;
- cvt.s64.s32 %rd54, %r231;
- cvt.s64.s32 %rd55, %r6;
+ mov.u32 %r226, %ctaid.y;
+ mul.lo.s32 %r227, %r10, %r4;
+ mul.lo.s32 %r11, %r227, %r226;
+ mad.lo.s32 %r228, %r2, %r8, %r6;
+ shl.b32 %r12, %r228, 4;
+ mul.lo.s32 %r229, %r176, %r8;
+ cvt.s64.s32 %rd54, %r229;
+ cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r232, %r10, %r176;
- cvt.s64.s32 %rd6, %r232;
- mul.lo.s32 %r12, %r176, %r3;
- mul.lo.s32 %r13, %r9, %r227;
- add.s32 %r14, %r231, %r6;
+ mul.lo.s32 %r230, %r11, %r176;
+ cvt.s64.s32 %rd6, %r230;
+ mul.lo.s32 %r13, %r176, %r4;
+ mul.lo.s32 %r14, %r10, %r226;
+ shl.b32 %r231, %r8, 2;
+ mad.lo.s32 %r232, %r231, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
- mul.wide.s32 %rd57, %r14, 4;
+ mul.wide.s32 %rd57, %r232, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
- mad.lo.s32 %r234, %r3, %r233, %r7;
- mad.lo.s32 %r15, %r234, %r2, %r5;
+ mad.lo.s32 %r234, %r4, %r233, %r8;
+ mad.lo.s32 %r15, %r234, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
- clz.b32 %r235, %r2;
+ clz.b32 %r235, %r3;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
- setp.lt.u32 %p14, %r5, %r16;
- add.s32 %r239, %r16, %r5;
- setp.lt.u32 %p15, %r239, %r2;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r239, %r16, %r6;
+ setp.lt.u32 %p15, %r239, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
- mul.wide.s32 %rd61, %r6, 4;
+ mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
@@ -171,23 +171,23 @@
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
- mov.u32 %r422, 0;
+ mov.u32 %r420, 0;
mov.f32 %f368, 0f00000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
- add.s32 %r247, %r11, %r246;
+ add.s32 %r247, %r246, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
- add.s32 %r273, %r11, %r272;
+ add.s32 %r273, %r272, %r12;
not.pred %p26, %p3;
mov.f32 %f369, %f368;
mov.f32 %f370, %f368;
mov.f32 %f371, %f368;
mov.f32 %f380, %f368;
@@ -197,16 +197,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r244, %r422, %r3, %r7;
- add.s32 %r245, %r244, %r10;
+ mad.lo.s32 %r244, %r420, %r4, %r8;
+ add.s32 %r245, %r244, %r11;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r249, %r12, %r422;
+ mul.lo.s32 %r249, %r13, %r420;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
@@ -225,53 +225,53 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r250, %r13, %r422;
- mad.lo.s32 %r251, %r250, %r3, %r7;
+ add.s32 %r250, %r14, %r420;
+ mad.lo.s32 %r251, %r250, %r4, %r8;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
+ ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r260, %r13, %r422;
- mad.lo.s32 %r261, %r260, %r3, %r7;
+ add.s32 %r260, %r14, %r420;
+ mad.lo.s32 %r261, %r260, %r4, %r8;
setp.gt.s32 %p20, %r261, 63;
- mov.u32 %r423, 0;
- mov.u32 %r424, %r423;
- mov.u32 %r425, %r423;
- mov.u32 %r426, %r423;
+ mov.u32 %r421, 0;
+ mov.u32 %r422, %r421;
+ mov.u32 %r423, %r421;
+ mov.u32 %r424, %r421;
@%p20 bra $L__BB0_15;
- ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
+ ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r423, 0;
- mov.u32 %r424, %r423;
- mov.u32 %r425, %r423;
- mov.u32 %r426, %r423;
+ mov.u32 %r421, 0;
+ mov.u32 %r422, %r421;
+ mov.u32 %r423, %r421;
+ mov.u32 %r424, %r421;
$L__BB0_15:
- add.s32 %r270, %r13, %r422;
- mad.lo.s32 %r33, %r270, %r3, %r7;
- mov.b32 %f117, %r426;
+ add.s32 %r270, %r14, %r420;
+ mad.lo.s32 %r33, %r270, %r4, %r8;
+ mov.b32 %f117, %r424;
add.f32 %f383, %f383, %f117;
- mov.b32 %f118, %r425;
+ mov.b32 %f118, %r423;
add.f32 %f382, %f382, %f118;
- mov.b32 %f119, %r424;
+ mov.b32 %f119, %r422;
add.f32 %f381, %f381, %f119;
- mov.b32 %f120, %r423;
+ mov.b32 %f120, %r421;
add.f32 %f380, %f380, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f366, 0f00000000;
@%p21 bra $L__BB0_17;
@@ -284,11 +284,11 @@
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
- mul.lo.s32 %r275, %r12, %r422;
+ mul.lo.s32 %r275, %r13, %r420;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
@@ -301,13 +301,13 @@
}
$L__BB0_19:
- add.s32 %r421, %r13, %r422;
- mad.lo.s32 %r420, %r421, %r3, %r7;
- setp.gt.s32 %p145, %r420, 63;
+ add.s32 %r419, %r14, %r420;
+ mad.lo.s32 %r418, %r419, %r4, %r8;
+ setp.gt.s32 %p145, %r418, 63;
mov.f32 %f372, 0f00000000;
mov.f32 %f367, %f372;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
@@ -364,37 +364,37 @@
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
- mov.u32 %r427, %r17;
+ mov.u32 %r425, %r17;
$L__BB0_27:
- setp.ge.u32 %p28, %r5, %r427;
+ setp.ge.u32 %p28, %r6, %r425;
@%p28 bra $L__BB0_29;
- add.s32 %r277, %r427, %r15;
+ add.s32 %r277, %r425, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r427, 1;
- setp.gt.u32 %p29, %r427, 3;
- mov.u32 %r427, %r35;
+ shr.u32 %r35, %r425, 1;
+ setp.gt.u32 %p29, %r425, 3;
+ mov.u32 %r425, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p30, %r5, 0;
+ setp.ne.s32 %p30, %r6, 0;
mov.f32 %f374, 0f00000000;
@%p30 bra $L__BB0_33;
- setp.lt.u32 %p31, %r2, 2;
+ setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f374, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
@@ -414,36 +414,36 @@
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
- mov.u32 %r428, %r17;
+ mov.u32 %r426, %r17;
$L__BB0_37:
- setp.ge.u32 %p34, %r5, %r428;
+ setp.ge.u32 %p34, %r6, %r426;
@%p34 bra $L__BB0_39;
- add.s32 %r278, %r428, %r15;
+ add.s32 %r278, %r426, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r428, 1;
- setp.gt.u32 %p35, %r428, 3;
- mov.u32 %r428, %r37;
+ shr.u32 %r37, %r426, 1;
+ setp.gt.u32 %p35, %r426, 3;
+ mov.u32 %r426, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f375, 0f00000000;
@%p30 bra $L__BB0_43;
- setp.lt.u32 %p37, %r2, 2;
+ setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f375, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
@@ -507,21 +507,20 @@
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
- mad.lo.s32 %r283, %r422, %r3, %r10;
- mad.lo.s32 %r284, %r283, %r176, %r14;
- mul.wide.s32 %rd88, %r284, 4;
+ mad.lo.s32 %r283, %r33, %r176, %r7;
+ mul.wide.s32 %rd88, %r283, 4;
add.s64 %rd87, %rd40, %rd88;
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
$L__BB0_49:
- add.s32 %r422, %r422, 1;
- setp.lt.s32 %p41, %r422, %r9;
+ add.s32 %r420, %r420, 1;
+ setp.lt.s32 %p41, %r420, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f368, 0f00000000;
@@ -532,31 +531,31 @@
mov.f32 %f381, %f368;
mov.f32 %f382, %f368;
mov.f32 %f383, %f368;
$L__BB0_50:
- mov.u32 %r285, %tid.z;
- mad.lo.s32 %r286, %r3, %r285, %r7;
- mad.lo.s32 %r39, %r286, %r2, %r5;
+ mov.u32 %r284, %tid.z;
+ mad.lo.s32 %r285, %r4, %r284, %r8;
+ mad.lo.s32 %r39, %r285, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
- clz.b32 %r287, %r3;
- mov.u32 %r288, 31;
- sub.s32 %r289, %r288, %r287;
- mov.u32 %r290, 1;
- shl.b32 %r40, %r290, %r289;
- setp.lt.u32 %p42, %r7, %r40;
- add.s32 %r291, %r40, %r7;
- setp.lt.u32 %p43, %r291, %r3;
+ clz.b32 %r286, %r4;
+ mov.u32 %r287, 31;
+ sub.s32 %r288, %r287, %r286;
+ mov.u32 %r289, 1;
+ shl.b32 %r40, %r289, %r288;
+ setp.lt.u32 %p42, %r8, %r40;
+ add.s32 %r290, %r40, %r8;
+ setp.lt.u32 %p43, %r290, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r292, %r2, %r289;
- add.s32 %r293, %r39, %r292;
- mul.wide.s32 %rd91, %r293, 4;
+ shl.b32 %r291, %r3, %r288;
+ add.s32 %r292, %r39, %r291;
+ mul.wide.s32 %rd91, %r292, 4;
add.s64 %rd24, %rd45, %rd91;
- shr.u32 %r294, %r40, 31;
- add.s32 %r295, %r40, %r294;
- shr.s32 %r443, %r295, 1;
+ shr.u32 %r293, %r40, 31;
+ add.s32 %r294, %r40, %r293;
+ shr.s32 %r441, %r294, 1;
st.shared.f32 [%rd23], %f380;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
@@ -568,49 +567,49 @@
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
- mov.u32 %r429, %r443;
+ mov.u32 %r427, %r441;
$L__BB0_54:
- setp.ge.u32 %p46, %r7, %r429;
+ setp.ge.u32 %p46, %r8, %r427;
@%p46 bra $L__BB0_56;
- mad.lo.s32 %r296, %r429, %r2, %r39;
- mul.wide.s32 %rd92, %r296, 4;
+ mad.lo.s32 %r295, %r427, %r3, %r39;
+ mul.wide.s32 %rd92, %r295, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r429, 1;
- setp.gt.u32 %p47, %r429, 3;
- mov.u32 %r429, %r43;
+ shr.u32 %r43, %r427, 1;
+ setp.gt.u32 %p47, %r427, 3;
+ mov.u32 %r427, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r298, %r39, %r2;
- mul.wide.u32 %rd95, %r298, 4;
+ add.s32 %r297, %r39, %r3;
+ mul.wide.u32 %rd95, %r297, 4;
add.s64 %rd25, %rd45, %rd95;
- setp.ne.s32 %p48, %r7, 0;
- mov.u32 %r430, 0;
+ setp.ne.s32 %p48, %r8, 0;
+ mov.u32 %r428, 0;
@%p48 bra $L__BB0_61;
- setp.lt.u32 %p49, %r3, 2;
+ setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f384, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f384, %f384, %f248;
$L__BB0_60:
- mov.b32 %r430, %f384;
+ mov.b32 %r428, %f384;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f381;
bar.sync 0;
@@ -623,45 +622,45 @@
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
- mov.u32 %r431, %r443;
+ mov.u32 %r429, %r441;
$L__BB0_65:
- setp.ge.u32 %p52, %r7, %r431;
+ setp.ge.u32 %p52, %r8, %r429;
@%p52 bra $L__BB0_67;
- mad.lo.s32 %r299, %r431, %r2, %r39;
- mul.wide.s32 %rd97, %r299, 4;
+ mad.lo.s32 %r298, %r429, %r3, %r39;
+ mul.wide.s32 %rd97, %r298, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r431, 1;
- setp.gt.u32 %p53, %r431, 3;
- mov.u32 %r431, %r47;
+ shr.u32 %r47, %r429, 1;
+ setp.gt.u32 %p53, %r429, 3;
+ mov.u32 %r429, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r432, 0;
+ mov.u32 %r430, 0;
@%p48 bra $L__BB0_72;
- setp.lt.u32 %p55, %r3, 2;
+ setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f385, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f385, %f385, %f256;
$L__BB0_71:
- mov.b32 %r432, %f385;
+ mov.b32 %r430, %f385;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
@@ -674,45 +673,45 @@
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
- mov.u32 %r433, %r443;
+ mov.u32 %r431, %r441;
$L__BB0_76:
- setp.ge.u32 %p58, %r7, %r433;
+ setp.ge.u32 %p58, %r8, %r431;
@%p58 bra $L__BB0_78;
- mad.lo.s32 %r301, %r433, %r2, %r39;
- mul.wide.s32 %rd100, %r301, 4;
+ mad.lo.s32 %r300, %r431, %r3, %r39;
+ mul.wide.s32 %rd100, %r300, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r433, 1;
- setp.gt.u32 %p59, %r433, 3;
- mov.u32 %r433, %r51;
+ shr.u32 %r51, %r431, 1;
+ setp.gt.u32 %p59, %r431, 3;
+ mov.u32 %r431, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r434, 0;
+ mov.u32 %r432, 0;
@%p48 bra $L__BB0_83;
- setp.lt.u32 %p61, %r3, 2;
+ setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f386, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f386, %f386, %f264;
$L__BB0_82:
- mov.b32 %r434, %f386;
+ mov.b32 %r432, %f386;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@@ -725,45 +724,45 @@
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
- mov.u32 %r435, %r443;
+ mov.u32 %r433, %r441;
$L__BB0_87:
- setp.ge.u32 %p64, %r7, %r435;
+ setp.ge.u32 %p64, %r8, %r433;
@%p64 bra $L__BB0_89;
- mad.lo.s32 %r303, %r435, %r2, %r39;
- mul.wide.s32 %rd103, %r303, 4;
+ mad.lo.s32 %r302, %r433, %r3, %r39;
+ mul.wide.s32 %rd103, %r302, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r435, 1;
- setp.gt.u32 %p65, %r435, 3;
- mov.u32 %r435, %r55;
+ shr.u32 %r55, %r433, 1;
+ setp.gt.u32 %p65, %r433, 3;
+ mov.u32 %r433, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r436, 0;
+ mov.u32 %r434, 0;
@%p48 bra $L__BB0_94;
- setp.lt.u32 %p67, %r3, 2;
+ setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f387, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f387, %f387, %f272;
$L__BB0_93:
- mov.b32 %r436, %f387;
+ mov.b32 %r434, %f387;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f368;
bar.sync 0;
@@ -776,45 +775,45 @@
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
- mov.u32 %r437, %r443;
+ mov.u32 %r435, %r441;
$L__BB0_98:
- setp.ge.u32 %p70, %r7, %r437;
+ setp.ge.u32 %p70, %r8, %r435;
@%p70 bra $L__BB0_100;
- mad.lo.s32 %r305, %r437, %r2, %r39;
- mul.wide.s32 %rd106, %r305, 4;
+ mad.lo.s32 %r304, %r435, %r3, %r39;
+ mul.wide.s32 %rd106, %r304, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r437, 1;
- setp.gt.u32 %p71, %r437, 3;
- mov.u32 %r437, %r59;
+ shr.u32 %r59, %r435, 1;
+ setp.gt.u32 %p71, %r435, 3;
+ mov.u32 %r435, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r438, 0;
+ mov.u32 %r436, 0;
@%p48 bra $L__BB0_105;
- setp.lt.u32 %p73, %r3, 2;
+ setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f388, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f388, %f388, %f280;
$L__BB0_104:
- mov.b32 %r438, %f388;
+ mov.b32 %r436, %f388;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f369;
bar.sync 0;
@@ -827,45 +826,45 @@
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
- mov.u32 %r439, %r443;
+ mov.u32 %r437, %r441;
$L__BB0_109:
- setp.ge.u32 %p76, %r7, %r439;
+ setp.ge.u32 %p76, %r8, %r437;
@%p76 bra $L__BB0_111;
- mad.lo.s32 %r307, %r439, %r2, %r39;
- mul.wide.s32 %rd109, %r307, 4;
+ mad.lo.s32 %r306, %r437, %r3, %r39;
+ mul.wide.s32 %rd109, %r306, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r439, 1;
- setp.gt.u32 %p77, %r439, 3;
- mov.u32 %r439, %r63;
+ shr.u32 %r63, %r437, 1;
+ setp.gt.u32 %p77, %r437, 3;
+ mov.u32 %r437, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r440, 0;
+ mov.u32 %r438, 0;
@%p48 bra $L__BB0_116;
- setp.lt.u32 %p79, %r3, 2;
+ setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f389, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f389, %f389, %f288;
$L__BB0_115:
- mov.b32 %r440, %f389;
+ mov.b32 %r438, %f389;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@@ -878,45 +877,45 @@
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
- mov.u32 %r441, %r443;
+ mov.u32 %r439, %r441;
$L__BB0_120:
- setp.ge.u32 %p82, %r7, %r441;
+ setp.ge.u32 %p82, %r8, %r439;
@%p82 bra $L__BB0_122;
- mad.lo.s32 %r309, %r441, %r2, %r39;
- mul.wide.s32 %rd112, %r309, 4;
+ mad.lo.s32 %r308, %r439, %r3, %r39;
+ mul.wide.s32 %rd112, %r308, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r441, 1;
- setp.gt.u32 %p83, %r441, 3;
- mov.u32 %r441, %r67;
+ shr.u32 %r67, %r439, 1;
+ setp.gt.u32 %p83, %r439, 3;
+ mov.u32 %r439, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r442, 0;
+ mov.u32 %r440, 0;
@%p48 bra $L__BB0_127;
- setp.lt.u32 %p85, %r3, 2;
+ setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f390, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f390, %f390, %f296;
$L__BB0_126:
- mov.b32 %r442, %f390;
+ mov.b32 %r440, %f390;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@@ -930,185 +929,184 @@
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p88, %r7, %r443;
+ setp.ge.u32 %p88, %r8, %r441;
@%p88 bra $L__BB0_132;
- mad.lo.s32 %r311, %r443, %r2, %r39;
- mul.wide.s32 %rd115, %r311, 4;
+ mad.lo.s32 %r310, %r441, %r3, %r39;
+ mul.wide.s32 %rd115, %r310, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r443, 1;
- setp.gt.u32 %p89, %r443, 3;
- mov.u32 %r443, %r71;
+ shr.u32 %r71, %r441, 1;
+ setp.gt.u32 %p89, %r441, 3;
+ mov.u32 %r441, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r444, 0;
+ mov.u32 %r442, 0;
@%p48 bra $L__BB0_137;
- setp.lt.u32 %p91, %r3, 2;
+ setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f391, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f391, %f391, %f304;
$L__BB0_136:
- mov.b32 %r444, %f391;
+ mov.b32 %r442, %f391;
$L__BB0_137:
- setp.eq.s32 %p144, %r7, 0;
+ setp.eq.s32 %p144, %r8, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r419, %r5, 2;
- mov.u32 %r321, %ctaid.y;
- mad.lo.s32 %r322, %r176, %r321, %r419;
- mul.wide.s32 %rd120, %r322, 4;
+ mov.u32 %r320, %ctaid.y;
+ mad.lo.s32 %r321, %r176, %r320, %r7;
+ mul.wide.s32 %rd120, %r321, 4;
add.s64 %rd118, %rd42, %rd120;
- st.volatile.global.v4.s32 [%rd118], {%r430,%r432,%r434,%r436};
+ st.volatile.global.v4.s32 [%rd118], {%r428,%r430,%r432,%r434};
add.s64 %rd119, %rd43, %rd120;
- st.volatile.global.v4.s32 [%rd119], {%r438,%r440,%r442,%r444};
+ st.volatile.global.v4.s32 [%rd119], {%r436,%r438,%r440,%r442};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r323, %r5, %r7;
- or.b32 %r325, %r323, %r285;
- setp.ne.s32 %p92, %r325, 0;
+ or.b32 %r322, %r6, %r8;
+ or.b32 %r324, %r322, %r284;
+ setp.ne.s32 %p92, %r324, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
- mov.u32 %r326, %ctaid.x;
- mov.u32 %r327, %ctaid.z;
- mov.u32 %r328, %nctaid.x;
- mad.lo.s32 %r329, %r327, %r328, %r326;
- mul.wide.s32 %rd122, %r329, 8;
+ mov.u32 %r325, %ctaid.x;
+ mov.u32 %r326, %ctaid.z;
+ mov.u32 %r327, %nctaid.x;
+ mad.lo.s32 %r328, %r326, %r327, %r325;
+ mul.wide.s32 %rd122, %r328, 8;
add.s64 %rd28, %rd121, %rd122;
- add.s32 %r330, %r8, -1;
- setp.eq.s32 %p93, %r74, %r330;
- cvt.s64.s32 %rd123, %r8;
+ add.s32 %r329, %r9, -1;
+ setp.eq.s32 %p93, %r74, %r329;
+ cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
- mov.u32 %r445, 8;
+ mov.u32 %r443, 8;
$L__BB0_142:
- nanosleep.u32 %r445;
-
- setp.lt.u32 %p95, %r445, 256;
- selp.u32 %r333, 1, 0, %p95;
- shl.b32 %r445, %r445, %r333;
+ nanosleep.u32 %r443;
+
+ setp.lt.u32 %p95, %r443, 256;
+ selp.u32 %r332, 1, 0, %p95;
+ shl.b32 %r443, %r443, %r332;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- add.s32 %r334, %r8, %r2;
- add.s32 %r335, %r334, -1;
- div.s32 %r77, %r335, %r2;
+ add.s32 %r333, %r9, %r3;
+ add.s32 %r334, %r333, -1;
+ div.s32 %r77, %r334, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f394, 0f00000000;
mov.f32 %f395, %f394;
@%p97 bra $L__BB0_149;
- add.s32 %r337, %r176, 1;
- shr.u32 %r338, %r337, 31;
- add.s32 %r339, %r337, %r338;
- shr.s32 %r340, %r339, 1;
- add.s32 %r341, %r3, %r340;
- add.s32 %r342, %r341, -1;
- shl.b32 %r343, %r7, 1;
- shl.b32 %r344, %r3, 1;
- mad.lo.s32 %r345, %r344, %r74, %r343;
- or.b32 %r346, %r345, 1;
- setp.ge.s32 %p98, %r346, %r176;
- div.s32 %r347, %r342, %r3;
- setp.ge.s32 %p99, %r74, %r347;
+ add.s32 %r336, %r176, 1;
+ shr.u32 %r337, %r336, 31;
+ add.s32 %r338, %r336, %r337;
+ shr.s32 %r339, %r338, 1;
+ add.s32 %r340, %r4, %r339;
+ add.s32 %r341, %r340, -1;
+ shl.b32 %r342, %r8, 1;
+ shl.b32 %r343, %r4, 1;
+ mad.lo.s32 %r344, %r343, %r74, %r342;
+ or.b32 %r345, %r344, 1;
+ setp.ge.s32 %p98, %r345, %r176;
+ div.s32 %r346, %r341, %r4;
+ setp.ge.s32 %p99, %r74, %r346;
or.pred %p6, %p99, %p98;
- mul.lo.s32 %r348, %r3, %r74;
- shl.b32 %r349, %r348, 1;
- mad.lo.s32 %r350, %r176, %r5, %r349;
- add.s32 %r447, %r350, %r343;
- mul.lo.s32 %r79, %r176, %r2;
- mov.u32 %r336, 0;
+ mul.lo.s32 %r347, %r4, %r74;
+ shl.b32 %r348, %r347, 1;
+ mad.lo.s32 %r349, %r176, %r6, %r348;
+ add.s32 %r445, %r349, %r342;
+ mul.lo.s32 %r79, %r176, %r3;
+ mov.u32 %r335, 0;
mov.f32 %f394, 0f00000000;
- mov.u32 %r446, %r5;
- mov.u32 %r448, %r336;
+ mov.u32 %r444, %r6;
+ mov.u32 %r446, %r335;
$L__BB0_145:
.pragma "nounroll";
- mov.u32 %r449, %r336;
- mov.u32 %r450, %r336;
+ mov.u32 %r447, %r335;
+ mov.u32 %r448, %r335;
@%p6 bra $L__BB0_148;
- setp.ge.s32 %p100, %r446, %r8;
- mov.u32 %r449, %r336;
- mov.u32 %r450, %r336;
+ setp.ge.s32 %p100, %r444, %r9;
+ mov.u32 %r447, %r335;
+ mov.u32 %r448, %r335;
@%p100 bra $L__BB0_148;
- mul.wide.s32 %rd132, %r447, 4;
+ mul.wide.s32 %rd132, %r445, 4;
add.s64 %rd131, %rd42, %rd132;
- ld.volatile.global.v2.s32 {%r450,%r449}, [%rd131];
+ ld.volatile.global.v2.s32 {%r448,%r447}, [%rd131];
$L__BB0_148:
- mov.b32 %f309, %r450;
+ mov.b32 %f309, %r448;
add.f32 %f394, %f394, %f309;
- mov.b32 %f310, %r449;
+ mov.b32 %f310, %r447;
add.f32 %f395, %f395, %f310;
- add.s32 %r447, %r447, %r79;
- add.s32 %r446, %r446, %r2;
- add.s32 %r448, %r448, 1;
- setp.lt.s32 %p101, %r448, %r77;
+ add.s32 %r445, %r445, %r79;
+ add.s32 %r444, %r444, %r3;
+ add.s32 %r446, %r446, 1;
+ setp.lt.s32 %p101, %r446, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
- clz.b32 %r357, %r2;
- mov.u32 %r358, 31;
- sub.s32 %r359, %r358, %r357;
- mov.u32 %r360, 1;
- shl.b32 %r90, %r360, %r359;
- setp.lt.u32 %p102, %r5, %r90;
- add.s32 %r361, %r90, %r5;
- setp.lt.u32 %p103, %r361, %r2;
+ clz.b32 %r356, %r3;
+ mov.u32 %r357, 31;
+ sub.s32 %r358, %r357, %r356;
+ mov.u32 %r359, 1;
+ shl.b32 %r90, %r359, %r358;
+ setp.lt.u32 %p102, %r6, %r90;
+ add.s32 %r360, %r90, %r6;
+ setp.lt.u32 %p103, %r360, %r3;
and.pred %p7, %p102, %p103;
- add.s32 %r362, %r39, %r90;
- mul.wide.s32 %rd133, %r362, 4;
+ add.s32 %r361, %r39, %r90;
+ mul.wide.s32 %rd133, %r361, 4;
add.s64 %rd30, %rd45, %rd133;
- shr.u32 %r363, %r90, 31;
- add.s32 %r364, %r90, %r363;
- shr.s32 %r461, %r364, 1;
+ shr.u32 %r362, %r90, 31;
+ add.s32 %r363, %r90, %r362;
+ shr.s32 %r459, %r363, 1;
st.shared.f32 [%rd23], %f394;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
@@ -1120,49 +1118,49 @@
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
- mov.u32 %r451, %r461;
+ mov.u32 %r449, %r459;
$L__BB0_153:
- setp.ge.u32 %p106, %r5, %r451;
+ setp.ge.u32 %p106, %r6, %r449;
@%p106 bra $L__BB0_155;
- add.s32 %r365, %r451, %r39;
- mul.wide.s32 %rd135, %r365, 4;
+ add.s32 %r364, %r449, %r39;
+ mul.wide.s32 %rd135, %r364, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
- shr.u32 %r93, %r451, 1;
- setp.gt.u32 %p107, %r451, 3;
- mov.u32 %r451, %r93;
+ shr.u32 %r93, %r449, 1;
+ setp.gt.u32 %p107, %r449, 3;
+ mov.u32 %r449, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
- add.s32 %r367, %r39, 1;
- mul.wide.u32 %rd138, %r367, 4;
+ add.s32 %r366, %r39, 1;
+ mul.wide.u32 %rd138, %r366, 4;
add.s64 %rd31, %rd45, %rd138;
- setp.ne.s32 %p108, %r5, 0;
- mov.u32 %r452, 0;
+ setp.ne.s32 %p108, %r6, 0;
+ mov.u32 %r450, 0;
@%p108 bra $L__BB0_160;
- setp.lt.u32 %p109, %r2, 2;
+ setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f396, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f396, %f396, %f318;
$L__BB0_159:
- mov.b32 %r452, %f396;
+ mov.b32 %r450, %f396;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f395;
bar.sync 0;
@@ -1175,158 +1173,158 @@
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
- mov.u32 %r453, %r461;
+ mov.u32 %r451, %r459;
$L__BB0_164:
- setp.ge.u32 %p112, %r5, %r453;
+ setp.ge.u32 %p112, %r6, %r451;
@%p112 bra $L__BB0_166;
- add.s32 %r368, %r453, %r39;
- mul.wide.s32 %rd140, %r368, 4;
+ add.s32 %r367, %r451, %r39;
+ mul.wide.s32 %rd140, %r367, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
- shr.u32 %r97, %r453, 1;
- setp.gt.u32 %p113, %r453, 3;
- mov.u32 %r453, %r97;
+ shr.u32 %r97, %r451, 1;
+ setp.gt.u32 %p113, %r451, 3;
+ mov.u32 %r451, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
- mov.u32 %r454, 0;
+ mov.u32 %r452, 0;
@%p108 bra $L__BB0_171;
- setp.lt.u32 %p115, %r2, 2;
+ setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f397, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f397, %f397, %f326;
$L__BB0_170:
- mov.b32 %r454, %f397;
+ mov.b32 %r452, %f397;
$L__BB0_171:
bar.sync 0;
- setp.eq.s32 %p116, %r5, 0;
+ setp.eq.s32 %p116, %r6, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
- add.s32 %r370, %r176, 1;
- shr.u32 %r371, %r370, 31;
- add.s32 %r372, %r370, %r371;
- shr.s32 %r373, %r372, 1;
- add.s32 %r374, %r3, %r373;
- add.s32 %r375, %r374, -1;
- div.s32 %r376, %r375, %r3;
- setp.ge.s32 %p117, %r74, %r376;
+ add.s32 %r369, %r176, 1;
+ shr.u32 %r370, %r369, 31;
+ add.s32 %r371, %r369, %r370;
+ shr.s32 %r372, %r371, 1;
+ add.s32 %r373, %r4, %r372;
+ add.s32 %r374, %r373, -1;
+ div.s32 %r375, %r374, %r4;
+ setp.ge.s32 %p117, %r74, %r375;
@%p117 bra $L__BB0_175;
- shl.b32 %r100, %r7, 1;
- mul.lo.s32 %r377, %r3, %r74;
- shl.b32 %r101, %r377, 1;
- add.s32 %r378, %r100, %r101;
- or.b32 %r379, %r378, 1;
- setp.ge.s32 %p118, %r379, %r176;
+ shl.b32 %r100, %r8, 1;
+ mul.lo.s32 %r376, %r4, %r74;
+ shl.b32 %r101, %r376, 1;
+ add.s32 %r377, %r100, %r101;
+ or.b32 %r378, %r377, 1;
+ setp.ge.s32 %p118, %r378, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
- add.s32 %r382, %r101, %r100;
- mul.wide.s32 %rd144, %r382, 4;
+ add.s32 %r381, %r101, %r100;
+ mul.wide.s32 %rd144, %r381, 4;
add.s64 %rd143, %rd160, %rd144;
- st.global.cs.v2.s32 [%rd143], {%r452,%r454};
+ st.global.cs.v2.s32 [%rd143], {%r450,%r452};
$L__BB0_175:
- add.s32 %r383, %r176, 1;
- shr.u32 %r384, %r383, 31;
- add.s32 %r385, %r383, %r384;
- shr.s32 %r386, %r385, 1;
- add.s32 %r387, %r3, %r386;
- add.s32 %r388, %r387, -1;
- div.s32 %r102, %r388, %r3;
+ add.s32 %r382, %r176, 1;
+ shr.u32 %r383, %r382, 31;
+ add.s32 %r384, %r382, %r383;
+ shr.s32 %r385, %r384, 1;
+ add.s32 %r386, %r4, %r385;
+ add.s32 %r387, %r386, -1;
+ div.s32 %r102, %r387, %r4;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f398, 0f00000000;
mov.f32 %f402, 0f00000000;
mov.f32 %f399, %f402;
@%p119 bra $L__BB0_178;
- shl.b32 %r103, %r7, 1;
- mul.lo.s32 %r389, %r3, %r74;
- shl.b32 %r104, %r389, 1;
- add.s32 %r390, %r103, %r104;
- or.b32 %r391, %r390, 1;
- setp.ge.s32 %p120, %r391, %r176;
+ shl.b32 %r103, %r8, 1;
+ mul.lo.s32 %r388, %r4, %r74;
+ shl.b32 %r104, %r388, 1;
+ add.s32 %r389, %r103, %r104;
+ or.b32 %r390, %r389, 1;
+ setp.ge.s32 %p120, %r390, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
- add.s32 %r394, %r104, %r103;
- mul.wide.s32 %rd146, %r394, 4;
+ add.s32 %r393, %r104, %r103;
+ mul.wide.s32 %rd146, %r393, 4;
add.s64 %rd145, %rd159, %rd146;
- ld.global.cs.v2.u32 {%r392,%r393}, [%rd145];
-
- mov.b32 %f398, %r392;
- mov.b32 %f399, %r393;
+ ld.global.cs.v2.u32 {%r391,%r392}, [%rd145];
+
+ mov.b32 %f398, %r391;
+ mov.b32 %f399, %r392;
$L__BB0_178:
mov.f32 %f403, %f402;
@%p97 bra $L__BB0_184;
- shl.b32 %r396, %r7, 1;
- shl.b32 %r397, %r3, 1;
- mad.lo.s32 %r398, %r397, %r74, %r396;
- or.b32 %r399, %r398, 1;
- setp.ge.s32 %p122, %r399, %r176;
+ shl.b32 %r395, %r8, 1;
+ shl.b32 %r396, %r4, 1;
+ mad.lo.s32 %r397, %r396, %r74, %r395;
+ or.b32 %r398, %r397, 1;
+ setp.ge.s32 %p122, %r398, %r176;
or.pred %p8, %p122, %p119;
- mul.lo.s32 %r400, %r3, %r74;
- shl.b32 %r401, %r400, 1;
- mad.lo.s32 %r402, %r176, %r5, %r401;
- add.s32 %r456, %r402, %r396;
- mul.lo.s32 %r106, %r176, %r2;
- mov.u32 %r395, 0;
+ mul.lo.s32 %r399, %r4, %r74;
+ shl.b32 %r400, %r399, 1;
+ mad.lo.s32 %r401, %r176, %r6, %r400;
+ add.s32 %r454, %r401, %r395;
+ mul.lo.s32 %r106, %r176, %r3;
+ mov.u32 %r394, 0;
mov.f32 %f402, 0f00000000;
- mov.u32 %r455, %r5;
+ mov.u32 %r453, %r6;
mov.f32 %f403, %f402;
- mov.u32 %r457, %r395;
+ mov.u32 %r455, %r394;
$L__BB0_180:
.pragma "nounroll";
- mov.u32 %r458, %r395;
- mov.u32 %r459, %r395;
+ mov.u32 %r456, %r394;
+ mov.u32 %r457, %r394;
@%p8 bra $L__BB0_183;
- setp.ge.s32 %p124, %r455, %r8;
- mov.u32 %r458, %r395;
- mov.u32 %r459, %r395;
+ setp.ge.s32 %p124, %r453, %r9;
+ mov.u32 %r456, %r394;
+ mov.u32 %r457, %r394;
@%p124 bra $L__BB0_183;
- mul.wide.s32 %rd148, %r456, 4;
+ mul.wide.s32 %rd148, %r454, 4;
add.s64 %rd147, %rd43, %rd148;
- ld.volatile.global.v2.s32 {%r459,%r458}, [%rd147];
+ ld.volatile.global.v2.s32 {%r457,%r456}, [%rd147];
$L__BB0_183:
- mov.b32 %f335, %r459;
+ mov.b32 %f335, %r457;
add.f32 %f402, %f402, %f335;
- mov.b32 %f336, %r458;
+ mov.b32 %f336, %r456;
add.f32 %f403, %f403, %f336;
- add.s32 %r456, %r456, %r106;
- add.s32 %r455, %r455, %r2;
- add.s32 %r457, %r457, 1;
- setp.lt.s32 %p125, %r457, %r77;
+ add.s32 %r454, %r454, %r106;
+ add.s32 %r453, %r453, %r3;
+ add.s32 %r455, %r455, 1;
+ setp.lt.s32 %p125, %r455, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f402;
bar.sync 0;
@@ -1339,36 +1337,36 @@
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
- mov.u32 %r460, %r461;
+ mov.u32 %r458, %r459;
$L__BB0_188:
- setp.ge.u32 %p128, %r5, %r460;
+ setp.ge.u32 %p128, %r6, %r458;
@%p128 bra $L__BB0_190;
- add.s32 %r409, %r460, %r39;
- mul.wide.s32 %rd149, %r409, 4;
+ add.s32 %r408, %r458, %r39;
+ mul.wide.s32 %rd149, %r408, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
- shr.u32 %r118, %r460, 1;
- setp.gt.u32 %p129, %r460, 3;
- mov.u32 %r460, %r118;
+ shr.u32 %r118, %r458, 1;
+ setp.gt.u32 %p129, %r458, 3;
+ mov.u32 %r458, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f404, 0f00000000;
@%p108 bra $L__BB0_194;
- setp.lt.u32 %p131, %r2, 2;
+ setp.lt.u32 %p131, %r3, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f404, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
@@ -1388,34 +1386,34 @@
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
- setp.ge.u32 %p134, %r5, %r461;
+ setp.ge.u32 %p134, %r6, %r459;
@%p134 bra $L__BB0_199;
- add.s32 %r410, %r461, %r39;
- mul.wide.s32 %rd152, %r410, 4;
+ add.s32 %r409, %r459, %r39;
+ mul.wide.s32 %rd152, %r409, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
- shr.u32 %r120, %r461, 1;
- setp.gt.u32 %p135, %r461, 3;
- mov.u32 %r461, %r120;
+ shr.u32 %r120, %r459, 1;
+ setp.gt.u32 %p135, %r459, 3;
+ mov.u32 %r459, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f96, %f404, %f398;
mov.f32 %f405, 0f00000000;
@%p108 bra $L__BB0_203;
- setp.lt.u32 %p137, %r2, 2;
+ setp.lt.u32 %p137, %r3, 2;
ld.shared.f32 %f353, [%rd23];
add.f32 %f405, %f353, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f354, [%rd31];
@@ -1424,34 +1422,34 @@
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
- shl.b32 %r121, %r7, 1;
- mul.lo.s32 %r411, %r3, %r74;
- shl.b32 %r122, %r411, 1;
- add.s32 %r412, %r121, %r122;
- or.b32 %r413, %r412, 1;
- setp.ge.s32 %p141, %r413, %r176;
+ shl.b32 %r121, %r8, 1;
+ mul.lo.s32 %r410, %r4, %r74;
+ shl.b32 %r122, %r410, 1;
+ add.s32 %r411, %r121, %r122;
+ or.b32 %r412, %r411, 1;
+ setp.ge.s32 %p141, %r412, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r418, %r122, %r121;
- mul.wide.s32 %rd157, %r418, 4;
+ add.s32 %r417, %r122, %r121;
+ mul.wide.s32 %rd157, %r417, 4;
add.s64 %rd155, %rd158, %rd157;
- mov.b32 %r415, %f405;
- mov.b32 %r414, %f404;
-
- st.global.cs.v2.s32 [%rd155], {%r414,%r415};
+ mov.b32 %r414, %f405;
+ mov.b32 %r413, %f404;
+
+ st.global.cs.v2.s32 [%rd155], {%r413,%r414};
add.s64 %rd156, %rd162, %rd157;
add.f32 %f355, %f405, %f399;
- mov.b32 %r417, %f355;
- mov.b32 %r416, %f96;
-
- st.global.cs.v2.s32 [%rd156], {%r416,%r417};
+ mov.b32 %r416, %f355;
+ mov.b32 %r415, %f96;
+
+ st.global.cs.v2.s32 [%rd156], {%r415,%r416};
$L__BB0_206:
ret;
Kernel 5
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 56
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T26, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T42, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<float, 2, 2> T66, Tensor<int64_t, 1, 1> T71) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T35 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
// Allocate global tensor T66
__syncthreads();
Array<float, 4, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T57[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T62[i7] = 0.000000000e+00f;
}
Array<float, 4, 4> T67;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T67[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T65;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9] = 0.000000000e+00f;
}
Array<float, 4, 1> T60;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
T60[i10] = 0.000000000e+00f;
}
Array<float, 4, 1> T55;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
T55[i11] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
+ T49[i9];
}
} else {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
+ T49[i9];
}
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T36[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i12))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T36[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i12))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T35) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T37;
T37[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T37[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i12))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T37[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i12))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T37[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T70;
T70[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T53;
T53.set(float(0));
loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
= T51[i10];
Array<float, 1, 1> T9;
T9[0]
= T47[i10]
* T8[0];
T54[0]
= T54[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T53[i10]
- T36[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T37[0];
Array<float, 1, 1> T21;
T21[0]
= T48[i10]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T70[0]
= T70[0]
+ T13[0];
T60[i10]
= T60[i10]
+ T21[0];
}
} else {
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T53;
T53.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
= T51[i10];
Array<float, 1, 1> T9;
T9[0]
= T47[i10]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T54[0]
= T54[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T53[i10]
- T36[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T37[0];
Array<float, 1, 1> T21;
T21[0]
= T48[i10]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T70[0]
= T70[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T60[i10]
= T60[i10]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T70[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T39;
T39.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T39[i11];
Array<float, 1, 1> T25;
T25[0]
= T24[0]
+ T36[0];
T55[i11]
= T55[i11]
+ T25[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11];
Array<float, 1, 1> T29;
T29[0]
= T46[i11]
* T28[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T29[0];
Array<float, 1, 1> T31;
T31[0]
= T52[i11]
- T36[0];
Array<float, 1, 1> T32;
T32[0]
= T31[0]
* T37[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T32[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T40[i11]
= T20[0]
+ T25[0];
T43[i11]
= T20[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T26[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T40[0]);
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T42[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T43[0]);
} else {
Array<float, 4, 4> T39;
T39.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T52;
T52.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T39[i11];
Array<float, 1, 1> T25;
T25[0]
= T24[0]
+ T36[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T55[i11]
= T55[i11]
+ T25[0];
}
Array<float, 1, 1> T28;
T28[0]
= T50[i11];
Array<float, 1, 1> T29;
T29[0]
= T46[i11]
* T28[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T29[0];
Array<float, 1, 1> T31;
T31[0]
= T52[i11]
- T36[0];
Array<float, 1, 1> T32;
T32[0]
= T31[0]
* T37[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T32[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T40[i11]
= T20[0]
+ T25[0];
T43[i11]
= T20[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T26[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T40[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T42[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
blockReduce<false, true, false, true>(T67[i8], T65[i8], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T57[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T62[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T66[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T67[0]);
}
}
// Allocate global tensor T71
grid_sync::sync<false, true, false, true, true>(T71[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T69;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T69[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T68;
T68.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T68[0], &*(volatile float*)&T66[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T69[i13]
= T69[i13]
+ T68[i13];
}
}
Array<float, 2, 2> T45;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T45[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
blockReduce<true, false, false, true>(T45[i15], T69[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T45[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
}
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T44[i18] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
blockReduce<true, false, false, true>(T44[i18], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T59[i19] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i20 = 0; i20 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i20) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i20)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i20))]);
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T59[i19]
= T59[i19]
+ T58[i19];
}
}
Array<float, 2, 2> T41;
#pragma unroll
for(nvfuser_index_t i21 = 0; i21 < 2; ++i21) {
T41[i21] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i21 = 0; i21 < 2; ++i21) {
blockReduce<true, false, false, true>(T41[i21], T59[i21], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T41[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T0, Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 2, 2> T26, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T42, Tensor<float, 1, 1> T22, Tensor<float, 1, 1> T23, Tensor<float, 2, 2> T56, Tensor<float, 2, 2> T61, Tensor<float, 2, 2> T66, Tensor<int64_t, 1, 1> T71) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T35 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T56
// Allocate global tensor T61
// Allocate global tensor T66
__syncthreads();
Array<float, 4, 4> T57;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T57[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T62;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T62[i7] = 0.000000000e+00f;
}
Array<float, 4, 4> T67;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T67[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T65;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9] = 0.000000000e+00f;
}
Array<float, 4, 1> T60;
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
T60[i10] = 0.000000000e+00f;
}
Array<float, 4, 1> T55;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
T55[i11] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i12 = 0; i12 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i12) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
+ T49[i9];
}
} else {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
+ T49[i9];
}
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T36[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i12))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T36[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i12))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T35) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T37;
T37[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T37[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i12))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64)) {
T37[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i12))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T37[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T54;
T54[0] = 0.000000000e+00f;
Array<float, 1, 1> T70;
T70[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T53;
T53.set(float(0));
loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
= T51[i10];
Array<float, 1, 1> T9;
T9[0]
= T47[i10]
* T8[0];
T54[0]
= T54[0]
+ T9[0];
Array<float, 1, 1> T6;
T6[0]
= T53[i10]
- T36[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T37[0];
Array<float, 1, 1> T21;
T21[0]
= T48[i10]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T70[0]
= T70[0]
+ T13[0];
T60[i10]
= T60[i10]
+ T21[0];
}
} else {
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T53;
T53.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
= T51[i10];
Array<float, 1, 1> T9;
T9[0]
= T47[i10]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T54[0]
= T54[0]
+ T9[0];
}
Array<float, 1, 1> T6;
T6[0]
= T53[i10]
- T36[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T37[0];
Array<float, 1, 1> T21;
T21[0]
= T48[i10]
* T7[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T70[0]
= T70[0]
+ T13[0];
}
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T60[i10]
= T60[i10]
+ T21[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T54[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T70[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T39;
T39.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T39[i11];
Array<float, 1, 1> T25;
T25[0]
= T24[0]
+ T36[0];
T55[i11]
= T55[i11]
+ T25[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11];
Array<float, 1, 1> T29;
T29[0]
= T46[i11]
* T28[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T29[0];
Array<float, 1, 1> T31;
T31[0]
= T52[i11]
- T36[0];
Array<float, 1, 1> T32;
T32[0]
= T31[0]
* T37[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T32[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T40[i11]
= T20[0]
+ T25[0];
T43[i11]
= T20[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T26[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T40[0]);
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T42[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T43[0]);
} else {
Array<float, 4, 4> T39;
T39.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T52;
T52.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T24;
T24[0]
= T39[i11];
Array<float, 1, 1> T25;
T25[0]
= T24[0]
+ T36[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
T55[i11]
= T55[i11]
+ T25[0];
}
Array<float, 1, 1> T28;
T28[0]
= T50[i11];
Array<float, 1, 1> T29;
T29[0]
= T46[i11]
* T28[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T29[0];
Array<float, 1, 1> T31;
T31[0]
= T52[i11]
- T36[0];
Array<float, 1, 1> T32;
T32[0]
= T31[0]
* T37[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T32[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
Array<float, 1, 1> T20;
T20[0]
= T19[0]
* T18[0];
T40[i11]
= T20[0]
+ T25[0];
T43[i11]
= T20[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T26[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T40[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T42[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T57[i6], T55[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T62[i7], T60[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
blockReduce<false, true, false, true>(T67[i8], T65[i8], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T56[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T57[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T61[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T62[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T66[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T67[0]);
}
}
// Allocate global tensor T71
grid_sync::sync<false, true, false, true, true>(T71[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T69;
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T69[i13] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i14 = 0; i14 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i14) {
Array<float, 2, 2> T68;
T68.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i14)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T68[0], &*(volatile float*)&T66[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i14))]);
}
#pragma unroll
for(nvfuser_index_t i13 = 0; i13 < 2; ++i13) {
T69[i13]
= T69[i13]
+ T68[i13];
}
}
Array<float, 2, 2> T45;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T45[i15] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
blockReduce<true, false, false, true>(T45[i15], T69[i15], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T23[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T45[0]);
}
Array<float, 2, 1> T64;
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i17 = 0; i17 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i17) {
Array<float, 2, 2> T63;
T63.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i17)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T63[0], &*(volatile float*)&T61[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i17))]);
}
#pragma unroll
for(nvfuser_index_t i16 = 0; i16 < 2; ++i16) {
T64[i16]
= T64[i16]
+ T63[i16];
}
}
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
T44[i18] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i18 = 0; i18 < 2; ++i18) {
blockReduce<true, false, false, true>(T44[i18], T64[i18], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
Array<float, 2, 1> T59;
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T59[i19] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i20 = 0; i20 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i20) {
Array<float, 2, 2> T58;
T58.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i20)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T58[0], &*(volatile float*)&T56[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i20))]);
}
#pragma unroll
for(nvfuser_index_t i19 = 0; i19 < 2; ++i19) {
T59[i19]
= T59[i19]
+ T58[i19];
}
}
Array<float, 2, 2> T41;
#pragma unroll
for(nvfuser_index_t i21 = 0; i21 < 2; ++i21) {
T41[i21] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i21 = 0; i21 < 2; ++i21) {
blockReduce<true, false, false, true>(T41[i21], T59[i21], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T41[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -78,32 +78,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
+ T49[i9];
}
} else {
Array<float, 4, 4> T49;
T49.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T65[i9]
= T65[i9]
@@ -127,11 +127,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T35) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T35) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i12))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -160,20 +160,20 @@
Array<float, 1, 1> T70;
T70[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
Array<float, 4, 4> T48;
T48.set(float(0));
- loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T53;
T53.set(float(0));
- loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
- loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
= T51[i10];
@@ -209,26 +209,26 @@
}
} else {
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T53;
T53.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T53[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i10 = 0; i10 < 4; ++i10) {
Array<float, 1, 1> T8;
T8[0]
@@ -280,17 +280,17 @@
Array<float, 4, 4> T39;
T39.set(float(0));
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T52;
T52.set(float(0));
- loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T46;
T46.set(float(0));
- loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
#pragma unroll
@@ -355,21 +355,21 @@
loadGlobalToLocal<float, /*vec_size=*/4, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T52;
T52.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T52[0], &T35[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T50;
T50.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T50[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i12)) < 64))) {
- loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T40 = T46;
// Alias Allocation - register
auto& T43 = T52;
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_12[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_13[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14[16]
)
{
.reg .pred %p<190>;
.reg .f32 %f<526>;
.reg .b32 %r<602>;
.reg .f64 %fd<3>;
.reg .b64 %rd<190>;
ld.param.v2.u32 {%r210, %r211}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r220, %r221}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r224, %r225}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd48, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_13];
ld.param.u64 %rd47, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_12];
ld.param.u64 %rd46, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd43, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd41, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5];
ld.param.u64 %rd39, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd38, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd37, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r258, %r211, 3;
shr.s32 %r259, %r258, 31;
shr.u32 %r260, %r259, 30;
add.s32 %r261, %r258, %r260;
shr.s32 %r262, %r261, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r263, %r262, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r264, %r3, 2;
mad.lo.s32 %r265, %r264, %r263, 15;
and.b32 %r266, %r265, -16;
cvt.u64.u32 %rd1, %r266;
mul.lo.s32 %r267, %r3, %r262;
shl.b32 %r268, %r267, 4;
or.b32 %r269, %r268, 15;
and.b32 %r4, %r269, -16;
add.s32 %r270, %r269, %r4;
and.b32 %r271, %r270, -16;
cvt.s64.s32 %rd2, %r271;
mov.u64 %rd50, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_103395arrayE;
cvta.shared.u64 %rd51, %rd50;
add.s64 %rd3, %rd51, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p10, %r5, %r262;
shl.b32 %r6, %r5, 2;
or.b32 %r272, %r6, 3;
setp.lt.s32 %p11, %r272, %r211;
and.pred %p1, %p11, %p10;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p12, %r7, 0;
and.pred %p2, %p12, %p1;
not.pred %p13, %p2;
@%p13 bra $L__BB0_2;
add.s64 %rd52, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd52; cvt.u32.u64 %r273, smem_ptr; }
// end inline asm
shl.b32 %r276, %r5, 4;
add.s32 %r274, %r273, %r276;
mul.wide.s32 %rd54, %r6, 4;
add.s64 %rd53, %rd39, %rd54;
mov.u32 %r275, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r275, 0;
cp.async.ca.shared.global [%r274], [%rd53], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r277, %r3, 63;
div.s32 %r278, %r277, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r279, %r8, %r278;
add.s32 %r280, %r279, -1;
div.s32 %r9, %r280, %r8;
setp.gt.s32 %p14, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p14 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r211;
cvt.s64.s32 %rd55, %r4;
add.s64 %rd56, %rd1, %rd55;
add.s64 %rd58, %rd50, %rd1;
mov.u32 %r282, %ctaid.y;
mul.lo.s32 %r283, %r9, %r3;
mul.lo.s32 %r10, %r283, %r282;
shl.b32 %r284, %r7, 2;
shl.b32 %r285, %r5, 4;
mad.lo.s32 %r11, %r284, %r211, %r285;
mul.lo.s32 %r286, %r211, %r7;
cvt.s64.s32 %rd59, %r286;
cvt.s64.s32 %rd60, %r6;
add.s64 %rd5, %rd59, %rd60;
mul.lo.s32 %r287, %r10, %r211;
cvt.s64.s32 %rd6, %r287;
mul.lo.s32 %r12, %r211, %r3;
mul.lo.s32 %r13, %r9, %r282;
add.s32 %r14, %r286, %r6;
add.s64 %rd61, %rd50, %rd56;
mul.wide.s32 %rd62, %r14, 4;
add.s64 %rd7, %rd61, %rd62;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r288, %tid.z;
mad.lo.s32 %r289, %r3, %r288, %r7;
mad.lo.s32 %r15, %r289, %r2, %r5;
mul.wide.u32 %rd63, %r15, 4;
add.s64 %rd8, %rd50, %rd63;
clz.b32 %r290, %r2;
mov.u32 %r291, 31;
sub.s32 %r292, %r291, %r290;
mov.u32 %r293, 1;
shl.b32 %r16, %r293, %r292;
setp.lt.u32 %p15, %r5, %r16;
add.s32 %r294, %r16, %r5;
setp.lt.u32 %p16, %r294, %r2;
and.pred %p3, %p15, %p16;
add.s32 %r295, %r15, %r16;
mul.wide.s32 %rd64, %r295, 4;
add.s64 %rd9, %rd50, %rd64;
shr.u32 %r296, %r16, 31;
add.s32 %r297, %r16, %r296;
shr.s32 %r17, %r297, 1;
add.s64 %rd10, %rd58, %rd62;
add.s32 %r298, %r15, 1;
mul.wide.u32 %rd65, %r298, 4;
add.s64 %rd11, %rd50, %rd65;
add.s64 %rd66, %rd50, %rd4;
mul.wide.s32 %rd67, %r6, 4;
add.s64 %rd12, %rd66, %rd67;
mul.wide.s32 %rd68, %r289, 4;
add.s64 %rd13, %rd50, %rd68;
add.s64 %rd14, %rd40, %rd67;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd17, %rd37;
cvta.to.global.u64 %rd18, %rd38;
add.s64 %rd21, %rd51, %rd56;
mov.u32 %r543, 0;
mov.f32 %f480, 0f00000000;
not.pred %p17, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd21; cvt.u32.u64 %r301, smem_ptr; }
// end inline asm
add.s32 %r302, %r11, %r301;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r327, smem_ptr; }
// end inline asm
add.s32 %r328, %r11, %r327;
not.pred %p27, %p3;
mov.f32 %f481, %f480;
mov.f32 %f482, %f480;
mov.f32 %f483, %f480;
mov.f32 %f488, %f480;
mov.f32 %f489, %f480;
mov.f32 %f490, %f480;
mov.f32 %f491, %f480;
mov.f32 %f492, %f480;
mov.f32 %f493, %f480;
mov.f32 %f494, %f480;
mov.f32 %f495, %f480;
$L__BB0_5:
.pragma "nounroll";
@%p17 bra $L__BB0_8;
mad.lo.s32 %r299, %r543, %r3, %r7;
add.s32 %r300, %r299, %r10;
setp.gt.s32 %p18, %r300, 63;
@%p18 bra $L__BB0_8;
mul.lo.s32 %r304, %r12, %r543;
cvt.s64.s32 %rd72, %r304;
add.s64 %rd73, %rd5, %rd72;
add.s64 %rd74, %rd73, %rd6;
shl.b64 %rd75, %rd74, 2;
add.s64 %rd71, %rd35, %rd75;
mov.u32 %r303, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r303, 0;
cp.async.ca.shared.global [%r302], [%rd71], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p17 bra $L__BB0_10;
add.s32 %r305, %r13, %r543;
mad.lo.s32 %r306, %r305, %r3, %r7;
setp.lt.s32 %p20, %r306, 64;
@%p20 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r544, %r545, %r546, %r547}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r315, %r13, %r543;
mad.lo.s32 %r316, %r315, %r3, %r7;
setp.gt.s32 %p21, %r316, 63;
mov.u32 %r544, 0;
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
@%p21 bra $L__BB0_15;
ld.shared.v4.u32 {%r544, %r545, %r546, %r547}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r544, 0;
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
$L__BB0_15:
add.s32 %r325, %r13, %r543;
mad.lo.s32 %r33, %r325, %r3, %r7;
mov.b32 %f160, %r547;
add.f32 %f495, %f495, %f160;
mov.b32 %f161, %r546;
add.f32 %f494, %f494, %f161;
mov.b32 %f162, %r545;
add.f32 %f493, %f493, %f162;
mov.b32 %f163, %r544;
add.f32 %f492, %f492, %f163;
setp.gt.s32 %p22, %r33, 63;
mov.f32 %f470, 0f00000000;
@%p22 bra $L__BB0_17;
mul.lo.s32 %r326, %r33, %r220;
mul.wide.s32 %rd76, %r326, 4;
add.s64 %rd77, %rd17, %rd76;
ld.global.f32 %f470, [%rd77];
$L__BB0_17:
setp.lt.s32 %p23, %r33, 64;
and.pred %p4, %p1, %p23;
not.pred %p24, %p4;
@%p24 bra $L__BB0_19;
mul.lo.s32 %r330, %r12, %r543;
cvt.s64.s32 %rd80, %r330;
add.s64 %rd81, %rd5, %rd80;
add.s64 %rd82, %rd81, %rd6;
shl.b64 %rd83, %rd82, 2;
add.s64 %rd79, %rd36, %rd83;
mov.u32 %r329, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r329, 0;
cp.async.ca.shared.global [%r328], [%rd79], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r542, %r13, %r543;
mad.lo.s32 %r541, %r542, %r3, %r7;
setp.gt.s32 %p189, %r541, 63;
mov.f32 %f476, 0f00000000;
mov.f32 %f471, %f476;
@%p189 bra $L__BB0_21;
mul.lo.s32 %r331, %r33, %r224;
mul.wide.s32 %rd84, %r331, 4;
add.s64 %rd85, %rd18, %rd84;
ld.global.f32 %f471, [%rd85];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f477, %f476;
@%p24 bra $L__BB0_23;
ld.shared.v4.f32 {%f167, %f168, %f169, %f170}, [%rd12];
ld.shared.v4.f32 {%f172, %f173, %f174, %f175}, [%rd7];
mul.f32 %f177, %f167, %f172;
add.f32 %f178, %f177, 0f00000000;
ld.shared.v4.f32 {%f179, %f180, %f181, %f182}, [%rd10];
sub.f32 %f184, %f179, %f470;
mul.f32 %f185, %f471, %f184;
fma.rn.f32 %f186, %f177, %f185, 0f00000000;
fma.rn.f32 %f488, %f185, %f172, %f488;
mul.f32 %f189, %f168, %f173;
add.f32 %f190, %f178, %f189;
sub.f32 %f192, %f180, %f470;
mul.f32 %f193, %f471, %f192;
fma.rn.f32 %f194, %f189, %f193, %f186;
fma.rn.f32 %f489, %f193, %f173, %f489;
mul.f32 %f197, %f169, %f174;
add.f32 %f198, %f190, %f197;
sub.f32 %f200, %f181, %f470;
mul.f32 %f201, %f471, %f200;
fma.rn.f32 %f202, %f197, %f201, %f194;
fma.rn.f32 %f490, %f201, %f174, %f490;
mul.f32 %f205, %f170, %f175;
add.f32 %f477, %f198, %f205;
sub.f32 %f207, %f182, %f470;
mul.f32 %f208, %f471, %f207;
fma.rn.f32 %f476, %f205, %f208, %f202;
fma.rn.f32 %f491, %f208, %f175, %f491;
$L__BB0_23:
st.shared.f32 [%rd8], %f477;
bar.sync 0;
@%p27 bra $L__BB0_25;
ld.shared.f32 %f209, [%rd9];
ld.shared.f32 %f210, [%rd8];
add.f32 %f211, %f209, %f210;
st.shared.f32 [%rd8], %f211;
$L__BB0_25:
setp.lt.s32 %p28, %r16, 4;
bar.sync 0;
@%p28 bra $L__BB0_30;
mov.u32 %r548, %r17;
$L__BB0_27:
setp.ge.u32 %p29, %r5, %r548;
@%p29 bra $L__BB0_29;
add.s32 %r332, %r548, %r15;
mul.wide.s32 %rd86, %r332, 4;
add.s64 %rd88, %rd50, %rd86;
ld.shared.f32 %f212, [%rd8];
ld.shared.f32 %f213, [%rd88];
add.f32 %f214, %f213, %f212;
st.shared.f32 [%rd8], %f214;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r548, 1;
setp.gt.u32 %p30, %r548, 3;
mov.u32 %r548, %r35;
@%p30 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p31, %r5, 0;
mov.f32 %f478, 0f00000000;
@%p31 bra $L__BB0_33;
setp.lt.u32 %p32, %r2, 2;
ld.shared.f32 %f216, [%rd8];
add.f32 %f478, %f216, 0f00000000;
@%p32 bra $L__BB0_33;
ld.shared.f32 %f217, [%rd11];
add.f32 %f478, %f478, %f217;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f476;
bar.sync 0;
@%p27 bra $L__BB0_35;
ld.shared.f32 %f218, [%rd9];
ld.shared.f32 %f219, [%rd8];
add.f32 %f220, %f218, %f219;
st.shared.f32 [%rd8], %f220;
$L__BB0_35:
setp.lt.s32 %p184, %r16, 4;
bar.sync 0;
@%p184 bra $L__BB0_40;
mov.u32 %r549, %r17;
$L__BB0_37:
setp.ge.u32 %p35, %r5, %r549;
@%p35 bra $L__BB0_39;
add.s32 %r333, %r549, %r15;
mul.wide.s32 %rd89, %r333, 4;
add.s64 %rd91, %rd50, %rd89;
ld.shared.f32 %f221, [%rd8];
ld.shared.f32 %f222, [%rd91];
add.f32 %f223, %f222, %f221;
st.shared.f32 [%rd8], %f223;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r549, 1;
setp.gt.u32 %p36, %r549, 3;
mov.u32 %r549, %r37;
@%p36 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f479, 0f00000000;
@%p31 bra $L__BB0_43;
setp.lt.u32 %p38, %r2, 2;
ld.shared.f32 %f225, [%rd8];
add.f32 %f479, %f225, 0f00000000;
@%p38 bra $L__BB0_43;
ld.shared.f32 %f226, [%rd11];
add.f32 %f479, %f479, %f226;
$L__BB0_43:
bar.sync 0;
@%p31 bra $L__BB0_45;
st.shared.f32 [%rd13], %f478;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f41, [%rd13];
bar.sync 0;
@%p31 bra $L__BB0_47;
st.shared.f32 [%rd13], %f479;
$L__BB0_47:
add.s32 %r540, %r13, %r543;
mad.lo.s32 %r539, %r540, %r3, %r7;
setp.lt.s32 %p188, %r539, 64;
and.pred %p187, %p1, %p188;
bar.sync 0;
ld.shared.f32 %f42, [%rd13];
bar.sync 0;
@%p187 bra $L__BB0_50;
bra.uni $L__BB0_48;
$L__BB0_50:
mul.f32 %f227, %f471, %f1;
// begin inline asm
ld.global.cs.v4.u32 {%r338,%r339,%r340,%r341}, [%rd14];
// end inline asm
mov.b32 %f228, %r338;
add.f32 %f229, %f470, %f228;
add.f32 %f480, %f480, %f229;
ld.shared.v4.f32 {%f230, %f231, %f232, %f233}, [%rd12];
ld.shared.v4.f32 {%f235, %f236, %f237, %f238}, [%rd7];
mul.f32 %f240, %f230, %f235;
mul.f32 %f241, %f240, %f2;
ld.shared.v4.f32 {%f242, %f243, %f244, %f245}, [%rd10];
sub.f32 %f247, %f242, %f470;
mul.f32 %f248, %f471, %f247;
sub.f32 %f249, %f241, %f41;
mul.f32 %f250, %f42, %f248;
sub.f32 %f251, %f249, %f250;
mul.f32 %f252, %f227, %f251;
add.f32 %f253, %f229, %f252;
mov.b32 %r342, %f253;
mov.b32 %r346, %f252;
mov.b32 %f254, %r339;
add.f32 %f255, %f470, %f254;
add.f32 %f481, %f481, %f255;
mul.f32 %f258, %f231, %f236;
mul.f32 %f259, %f258, %f2;
sub.f32 %f261, %f243, %f470;
mul.f32 %f262, %f471, %f261;
sub.f32 %f263, %f259, %f41;
mul.f32 %f264, %f42, %f262;
sub.f32 %f265, %f263, %f264;
mul.f32 %f266, %f227, %f265;
add.f32 %f267, %f255, %f266;
mov.b32 %r343, %f267;
mov.b32 %r347, %f266;
mov.b32 %f268, %r340;
add.f32 %f269, %f470, %f268;
add.f32 %f482, %f482, %f269;
mul.f32 %f272, %f232, %f237;
mul.f32 %f273, %f272, %f2;
sub.f32 %f275, %f244, %f470;
mul.f32 %f276, %f471, %f275;
sub.f32 %f277, %f273, %f41;
mul.f32 %f278, %f42, %f276;
sub.f32 %f279, %f277, %f278;
mul.f32 %f280, %f227, %f279;
add.f32 %f281, %f269, %f280;
mov.b32 %r344, %f281;
mov.b32 %r348, %f280;
mov.b32 %f282, %r341;
add.f32 %f283, %f470, %f282;
add.f32 %f483, %f483, %f283;
mul.f32 %f286, %f233, %f238;
mul.f32 %f287, %f286, %f2;
sub.f32 %f289, %f245, %f470;
mul.f32 %f290, %f471, %f289;
sub.f32 %f291, %f287, %f41;
mul.f32 %f292, %f42, %f290;
sub.f32 %f293, %f291, %f292;
mul.f32 %f294, %f227, %f293;
add.f32 %f295, %f283, %f294;
mov.b32 %r345, %f295;
mov.b32 %r349, %f294;
mad.lo.s32 %r350, %r543, %r3, %r10;
mad.lo.s32 %r351, %r350, %r211, %r14;
mul.wide.s32 %rd96, %r351, 4;
add.s64 %rd94, %rd41, %rd96;
// begin inline asm
st.global.cs.v4.s32 [%rd94], {%r342,%r343,%r344,%r345};
// end inline asm
add.s64 %rd95, %rd43, %rd96;
// begin inline asm
st.global.cs.v4.s32 [%rd95], {%r346,%r347,%r348,%r349};
// end inline asm
bra.uni $L__BB0_51;
$L__BB0_48:
@%p17 bra $L__BB0_51;
// begin inline asm
ld.global.cs.v4.u32 {%r334,%r335,%r336,%r337}, [%rd14];
// end inline asm
$L__BB0_51:
add.s32 %r543, %r543, 1;
setp.lt.s32 %p42, %r543, %r9;
@%p42 bra $L__BB0_5;
bra.uni $L__BB0_52;
$L__BB0_3:
mov.f32 %f480, 0f00000000;
mov.f32 %f481, %f480;
mov.f32 %f482, %f480;
mov.f32 %f483, %f480;
mov.f32 %f488, %f480;
mov.f32 %f489, %f480;
mov.f32 %f490, %f480;
mov.f32 %f491, %f480;
mov.f32 %f492, %f480;
mov.f32 %f493, %f480;
mov.f32 %f494, %f480;
mov.f32 %f495, %f480;
$L__BB0_52:
mov.u32 %r352, %tid.z;
mad.lo.s32 %r353, %r3, %r352, %r7;
mad.lo.s32 %r39, %r353, %r2, %r5;
mul.wide.u32 %rd97, %r39, 4;
add.s64 %rd25, %rd50, %rd97;
clz.b32 %r354, %r3;
mov.u32 %r355, 31;
sub.s32 %r356, %r355, %r354;
mov.u32 %r357, 1;
shl.b32 %r40, %r357, %r356;
setp.lt.u32 %p43, %r7, %r40;
add.s32 %r358, %r40, %r7;
setp.lt.u32 %p44, %r358, %r3;
and.pred %p5, %p43, %p44;
shl.b32 %r359, %r2, %r356;
add.s32 %r360, %r39, %r359;
mul.wide.s32 %rd99, %r360, 4;
add.s64 %rd26, %rd50, %rd99;
shr.u32 %r361, %r40, 31;
add.s32 %r362, %r40, %r361;
shr.s32 %r572, %r362, 1;
st.shared.f32 [%rd25], %f480;
bar.sync 0;
not.pred %p45, %p5;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f296, [%rd26];
ld.shared.f32 %f297, [%rd25];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd25], %f298;
$L__BB0_54:
setp.lt.s32 %p46, %r40, 4;
bar.sync 0;
@%p46 bra $L__BB0_59;
mov.u32 %r550, %r572;
$L__BB0_56:
setp.ge.u32 %p47, %r7, %r550;
@%p47 bra $L__BB0_58;
mad.lo.s32 %r363, %r550, %r2, %r39;
mul.wide.s32 %rd100, %r363, 4;
add.s64 %rd102, %rd50, %rd100;
ld.shared.f32 %f299, [%rd25];
ld.shared.f32 %f300, [%rd102];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd25], %f301;
$L__BB0_58:
bar.sync 0;
shr.u32 %r43, %r550, 1;
setp.gt.u32 %p48, %r550, 3;
mov.u32 %r550, %r43;
@%p48 bra $L__BB0_56;
$L__BB0_59:
add.s32 %r365, %r39, %r2;
mul.wide.u32 %rd103, %r365, 4;
add.s64 %rd27, %rd50, %rd103;
setp.ne.s32 %p49, %r7, 0;
mov.u32 %r551, 0;
@%p49 bra $L__BB0_63;
setp.lt.u32 %p50, %r3, 2;
ld.shared.f32 %f302, [%rd25];
add.f32 %f496, %f302, 0f00000000;
@%p50 bra $L__BB0_62;
ld.shared.f32 %f303, [%rd27];
add.f32 %f496, %f496, %f303;
$L__BB0_62:
mov.b32 %r551, %f496;
$L__BB0_63:
bar.sync 0;
st.shared.f32 [%rd25], %f481;
bar.sync 0;
@%p45 bra $L__BB0_65;
ld.shared.f32 %f304, [%rd26];
ld.shared.f32 %f305, [%rd25];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd25], %f306;
$L__BB0_65:
bar.sync 0;
@%p46 bra $L__BB0_70;
mov.u32 %r552, %r572;
$L__BB0_67:
setp.ge.u32 %p53, %r7, %r552;
@%p53 bra $L__BB0_69;
mad.lo.s32 %r366, %r552, %r2, %r39;
mul.wide.s32 %rd105, %r366, 4;
add.s64 %rd107, %rd50, %rd105;
ld.shared.f32 %f307, [%rd25];
ld.shared.f32 %f308, [%rd107];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd25], %f309;
$L__BB0_69:
bar.sync 0;
shr.u32 %r47, %r552, 1;
setp.gt.u32 %p54, %r552, 3;
mov.u32 %r552, %r47;
@%p54 bra $L__BB0_67;
$L__BB0_70:
mov.u32 %r553, 0;
@%p49 bra $L__BB0_74;
setp.lt.u32 %p56, %r3, 2;
ld.shared.f32 %f310, [%rd25];
add.f32 %f497, %f310, 0f00000000;
@%p56 bra $L__BB0_73;
ld.shared.f32 %f311, [%rd27];
add.f32 %f497, %f497, %f311;
$L__BB0_73:
mov.b32 %r553, %f497;
$L__BB0_74:
bar.sync 0;
st.shared.f32 [%rd25], %f482;
bar.sync 0;
@%p45 bra $L__BB0_76;
ld.shared.f32 %f312, [%rd26];
ld.shared.f32 %f313, [%rd25];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd25], %f314;
$L__BB0_76:
bar.sync 0;
@%p46 bra $L__BB0_81;
mov.u32 %r554, %r572;
$L__BB0_78:
setp.ge.u32 %p59, %r7, %r554;
@%p59 bra $L__BB0_80;
mad.lo.s32 %r368, %r554, %r2, %r39;
mul.wide.s32 %rd108, %r368, 4;
add.s64 %rd110, %rd50, %rd108;
ld.shared.f32 %f315, [%rd25];
ld.shared.f32 %f316, [%rd110];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd25], %f317;
$L__BB0_80:
bar.sync 0;
shr.u32 %r51, %r554, 1;
setp.gt.u32 %p60, %r554, 3;
mov.u32 %r554, %r51;
@%p60 bra $L__BB0_78;
$L__BB0_81:
mov.u32 %r555, 0;
@%p49 bra $L__BB0_85;
setp.lt.u32 %p62, %r3, 2;
ld.shared.f32 %f318, [%rd25];
add.f32 %f498, %f318, 0f00000000;
@%p62 bra $L__BB0_84;
ld.shared.f32 %f319, [%rd27];
add.f32 %f498, %f498, %f319;
$L__BB0_84:
mov.b32 %r555, %f498;
$L__BB0_85:
bar.sync 0;
st.shared.f32 [%rd25], %f483;
bar.sync 0;
@%p45 bra $L__BB0_87;
ld.shared.f32 %f320, [%rd26];
ld.shared.f32 %f321, [%rd25];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd25], %f322;
$L__BB0_87:
bar.sync 0;
@%p46 bra $L__BB0_92;
mov.u32 %r556, %r572;
$L__BB0_89:
setp.ge.u32 %p65, %r7, %r556;
@%p65 bra $L__BB0_91;
mad.lo.s32 %r370, %r556, %r2, %r39;
mul.wide.s32 %rd111, %r370, 4;
add.s64 %rd113, %rd50, %rd111;
ld.shared.f32 %f323, [%rd25];
ld.shared.f32 %f324, [%rd113];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd25], %f325;
$L__BB0_91:
bar.sync 0;
shr.u32 %r55, %r556, 1;
setp.gt.u32 %p66, %r556, 3;
mov.u32 %r556, %r55;
@%p66 bra $L__BB0_89;
$L__BB0_92:
mov.u32 %r557, 0;
@%p49 bra $L__BB0_96;
setp.lt.u32 %p68, %r3, 2;
ld.shared.f32 %f326, [%rd25];
add.f32 %f499, %f326, 0f00000000;
@%p68 bra $L__BB0_95;
ld.shared.f32 %f327, [%rd27];
add.f32 %f499, %f499, %f327;
$L__BB0_95:
mov.b32 %r557, %f499;
$L__BB0_96:
bar.sync 0;
st.shared.f32 [%rd25], %f488;
bar.sync 0;
@%p45 bra $L__BB0_98;
ld.shared.f32 %f328, [%rd26];
ld.shared.f32 %f329, [%rd25];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd25], %f330;
$L__BB0_98:
bar.sync 0;
@%p46 bra $L__BB0_103;
mov.u32 %r558, %r572;
$L__BB0_100:
setp.ge.u32 %p71, %r7, %r558;
@%p71 bra $L__BB0_102;
mad.lo.s32 %r372, %r558, %r2, %r39;
mul.wide.s32 %rd114, %r372, 4;
add.s64 %rd116, %rd50, %rd114;
ld.shared.f32 %f331, [%rd25];
ld.shared.f32 %f332, [%rd116];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd25], %f333;
$L__BB0_102:
bar.sync 0;
shr.u32 %r59, %r558, 1;
setp.gt.u32 %p72, %r558, 3;
mov.u32 %r558, %r59;
@%p72 bra $L__BB0_100;
$L__BB0_103:
mov.u32 %r559, 0;
@%p49 bra $L__BB0_107;
setp.lt.u32 %p74, %r3, 2;
ld.shared.f32 %f334, [%rd25];
add.f32 %f500, %f334, 0f00000000;
@%p74 bra $L__BB0_106;
ld.shared.f32 %f335, [%rd27];
add.f32 %f500, %f500, %f335;
$L__BB0_106:
mov.b32 %r559, %f500;
$L__BB0_107:
bar.sync 0;
st.shared.f32 [%rd25], %f489;
bar.sync 0;
@%p45 bra $L__BB0_109;
ld.shared.f32 %f336, [%rd26];
ld.shared.f32 %f337, [%rd25];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd25], %f338;
$L__BB0_109:
bar.sync 0;
@%p46 bra $L__BB0_114;
mov.u32 %r560, %r572;
$L__BB0_111:
setp.ge.u32 %p77, %r7, %r560;
@%p77 bra $L__BB0_113;
mad.lo.s32 %r374, %r560, %r2, %r39;
mul.wide.s32 %rd117, %r374, 4;
add.s64 %rd119, %rd50, %rd117;
ld.shared.f32 %f339, [%rd25];
ld.shared.f32 %f340, [%rd119];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd25], %f341;
$L__BB0_113:
bar.sync 0;
shr.u32 %r63, %r560, 1;
setp.gt.u32 %p78, %r560, 3;
mov.u32 %r560, %r63;
@%p78 bra $L__BB0_111;
$L__BB0_114:
mov.u32 %r561, 0;
@%p49 bra $L__BB0_118;
setp.lt.u32 %p80, %r3, 2;
ld.shared.f32 %f342, [%rd25];
add.f32 %f501, %f342, 0f00000000;
@%p80 bra $L__BB0_117;
ld.shared.f32 %f343, [%rd27];
add.f32 %f501, %f501, %f343;
$L__BB0_117:
mov.b32 %r561, %f501;
$L__BB0_118:
bar.sync 0;
st.shared.f32 [%rd25], %f490;
bar.sync 0;
@%p45 bra $L__BB0_120;
ld.shared.f32 %f344, [%rd26];
ld.shared.f32 %f345, [%rd25];
add.f32 %f346, %f344, %f345;
st.shared.f32 [%rd25], %f346;
$L__BB0_120:
bar.sync 0;
@%p46 bra $L__BB0_125;
mov.u32 %r562, %r572;
$L__BB0_122:
setp.ge.u32 %p83, %r7, %r562;
@%p83 bra $L__BB0_124;
mad.lo.s32 %r376, %r562, %r2, %r39;
mul.wide.s32 %rd120, %r376, 4;
add.s64 %rd122, %rd50, %rd120;
ld.shared.f32 %f347, [%rd25];
ld.shared.f32 %f348, [%rd122];
add.f32 %f349, %f348, %f347;
st.shared.f32 [%rd25], %f349;
$L__BB0_124:
bar.sync 0;
shr.u32 %r67, %r562, 1;
setp.gt.u32 %p84, %r562, 3;
mov.u32 %r562, %r67;
@%p84 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r563, 0;
@%p49 bra $L__BB0_129;
setp.lt.u32 %p86, %r3, 2;
ld.shared.f32 %f350, [%rd25];
add.f32 %f502, %f350, 0f00000000;
@%p86 bra $L__BB0_128;
ld.shared.f32 %f351, [%rd27];
add.f32 %f502, %f502, %f351;
$L__BB0_128:
mov.b32 %r563, %f502;
$L__BB0_129:
bar.sync 0;
st.shared.f32 [%rd25], %f491;
bar.sync 0;
@%p45 bra $L__BB0_131;
ld.shared.f32 %f352, [%rd26];
ld.shared.f32 %f353, [%rd25];
add.f32 %f354, %f352, %f353;
st.shared.f32 [%rd25], %f354;
$L__BB0_131:
bar.sync 0;
@%p46 bra $L__BB0_136;
mov.u32 %r564, %r572;
$L__BB0_133:
setp.ge.u32 %p89, %r7, %r564;
@%p89 bra $L__BB0_135;
mad.lo.s32 %r378, %r564, %r2, %r39;
mul.wide.s32 %rd123, %r378, 4;
add.s64 %rd125, %rd50, %rd123;
ld.shared.f32 %f355, [%rd25];
ld.shared.f32 %f356, [%rd125];
add.f32 %f357, %f356, %f355;
st.shared.f32 [%rd25], %f357;
$L__BB0_135:
bar.sync 0;
shr.u32 %r71, %r564, 1;
setp.gt.u32 %p90, %r564, 3;
mov.u32 %r564, %r71;
@%p90 bra $L__BB0_133;
$L__BB0_136:
mov.u32 %r565, 0;
@%p49 bra $L__BB0_140;
setp.lt.u32 %p92, %r3, 2;
ld.shared.f32 %f358, [%rd25];
add.f32 %f503, %f358, 0f00000000;
@%p92 bra $L__BB0_139;
ld.shared.f32 %f359, [%rd27];
add.f32 %f503, %f503, %f359;
$L__BB0_139:
mov.b32 %r565, %f503;
$L__BB0_140:
bar.sync 0;
st.shared.f32 [%rd25], %f492;
bar.sync 0;
@%p45 bra $L__BB0_142;
ld.shared.f32 %f360, [%rd26];
ld.shared.f32 %f361, [%rd25];
add.f32 %f362, %f360, %f361;
st.shared.f32 [%rd25], %f362;
$L__BB0_142:
bar.sync 0;
@%p46 bra $L__BB0_147;
mov.u32 %r566, %r572;
$L__BB0_144:
setp.ge.u32 %p95, %r7, %r566;
@%p95 bra $L__BB0_146;
mad.lo.s32 %r380, %r566, %r2, %r39;
mul.wide.s32 %rd126, %r380, 4;
add.s64 %rd128, %rd50, %rd126;
ld.shared.f32 %f363, [%rd25];
ld.shared.f32 %f364, [%rd128];
add.f32 %f365, %f364, %f363;
st.shared.f32 [%rd25], %f365;
$L__BB0_146:
bar.sync 0;
shr.u32 %r75, %r566, 1;
setp.gt.u32 %p96, %r566, 3;
mov.u32 %r566, %r75;
@%p96 bra $L__BB0_144;
$L__BB0_147:
mov.u32 %r567, 0;
@%p49 bra $L__BB0_151;
setp.lt.u32 %p98, %r3, 2;
ld.shared.f32 %f366, [%rd25];
add.f32 %f504, %f366, 0f00000000;
@%p98 bra $L__BB0_150;
ld.shared.f32 %f367, [%rd27];
add.f32 %f504, %f504, %f367;
$L__BB0_150:
mov.b32 %r567, %f504;
$L__BB0_151:
bar.sync 0;
st.shared.f32 [%rd25], %f493;
bar.sync 0;
@%p45 bra $L__BB0_153;
ld.shared.f32 %f368, [%rd26];
ld.shared.f32 %f369, [%rd25];
add.f32 %f370, %f368, %f369;
st.shared.f32 [%rd25], %f370;
$L__BB0_153:
bar.sync 0;
@%p46 bra $L__BB0_158;
mov.u32 %r568, %r572;
$L__BB0_155:
setp.ge.u32 %p101, %r7, %r568;
@%p101 bra $L__BB0_157;
mad.lo.s32 %r382, %r568, %r2, %r39;
mul.wide.s32 %rd129, %r382, 4;
add.s64 %rd131, %rd50, %rd129;
ld.shared.f32 %f371, [%rd25];
ld.shared.f32 %f372, [%rd131];
add.f32 %f373, %f372, %f371;
st.shared.f32 [%rd25], %f373;
$L__BB0_157:
bar.sync 0;
shr.u32 %r79, %r568, 1;
setp.gt.u32 %p102, %r568, 3;
mov.u32 %r568, %r79;
@%p102 bra $L__BB0_155;
$L__BB0_158:
mov.u32 %r569, 0;
@%p49 bra $L__BB0_162;
setp.lt.u32 %p104, %r3, 2;
ld.shared.f32 %f374, [%rd25];
add.f32 %f505, %f374, 0f00000000;
@%p104 bra $L__BB0_161;
ld.shared.f32 %f375, [%rd27];
add.f32 %f505, %f505, %f375;
$L__BB0_161:
mov.b32 %r569, %f505;
$L__BB0_162:
bar.sync 0;
st.shared.f32 [%rd25], %f494;
bar.sync 0;
@%p45 bra $L__BB0_164;
ld.shared.f32 %f376, [%rd26];
ld.shared.f32 %f377, [%rd25];
add.f32 %f378, %f376, %f377;
st.shared.f32 [%rd25], %f378;
$L__BB0_164:
bar.sync 0;
@%p46 bra $L__BB0_169;
mov.u32 %r570, %r572;
$L__BB0_166:
setp.ge.u32 %p107, %r7, %r570;
@%p107 bra $L__BB0_168;
mad.lo.s32 %r384, %r570, %r2, %r39;
mul.wide.s32 %rd132, %r384, 4;
add.s64 %rd134, %rd50, %rd132;
ld.shared.f32 %f379, [%rd25];
ld.shared.f32 %f380, [%rd134];
add.f32 %f381, %f380, %f379;
st.shared.f32 [%rd25], %f381;
$L__BB0_168:
bar.sync 0;
shr.u32 %r83, %r570, 1;
setp.gt.u32 %p108, %r570, 3;
mov.u32 %r570, %r83;
@%p108 bra $L__BB0_166;
$L__BB0_169:
mov.u32 %r571, 0;
@%p49 bra $L__BB0_173;
setp.lt.u32 %p110, %r3, 2;
ld.shared.f32 %f382, [%rd25];
add.f32 %f506, %f382, 0f00000000;
@%p110 bra $L__BB0_172;
ld.shared.f32 %f383, [%rd27];
add.f32 %f506, %f506, %f383;
$L__BB0_172:
mov.b32 %r571, %f506;
$L__BB0_173:
bar.sync 0;
st.shared.f32 [%rd25], %f495;
bar.sync 0;
@%p45 bra $L__BB0_175;
ld.shared.f32 %f384, [%rd26];
ld.shared.f32 %f385, [%rd25];
add.f32 %f386, %f384, %f385;
st.shared.f32 [%rd25], %f386;
$L__BB0_175:
bar.sync 0;
@%p46 bra $L__BB0_179;
$L__BB0_176:
setp.ge.u32 %p113, %r7, %r572;
@%p113 bra $L__BB0_178;
mad.lo.s32 %r386, %r572, %r2, %r39;
mul.wide.s32 %rd135, %r386, 4;
add.s64 %rd137, %rd50, %rd135;
ld.shared.f32 %f387, [%rd25];
ld.shared.f32 %f388, [%rd137];
add.f32 %f389, %f388, %f387;
st.shared.f32 [%rd25], %f389;
$L__BB0_178:
bar.sync 0;
shr.u32 %r87, %r572, 1;
setp.gt.u32 %p114, %r572, 3;
mov.u32 %r572, %r87;
@%p114 bra $L__BB0_176;
$L__BB0_179:
mov.u32 %r573, 0;
@%p49 bra $L__BB0_183;
setp.lt.u32 %p116, %r3, 2;
ld.shared.f32 %f390, [%rd25];
add.f32 %f507, %f390, 0f00000000;
@%p116 bra $L__BB0_182;
ld.shared.f32 %f391, [%rd27];
add.f32 %f507, %f507, %f391;
$L__BB0_182:
mov.b32 %r573, %f507;
$L__BB0_183:
setp.eq.s32 %p186, %r7, 0;
and.pred %p185, %p186, %p1;
bar.sync 0;
@%p185 bra $L__BB0_184;
bra.uni $L__BB0_185;
$L__BB0_184:
shl.b32 %r538, %r5, 2;
mov.u32 %r400, %ctaid.y;
mad.lo.s32 %r401, %r211, %r400, %r538;
mul.wide.s32 %rd141, %r401, 4;
add.s64 %rd138, %rd46, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd138], {%r551,%r553,%r555,%r557};
// end inline asm
add.s64 %rd139, %rd47, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd139], {%r559,%r561,%r563,%r565};
// end inline asm
add.s64 %rd140, %rd48, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd140], {%r567,%r569,%r571,%r573};
// end inline asm
$L__BB0_185:
mov.u32 %r90, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r402, %r5, %r7;
or.b32 %r404, %r402, %r352;
setp.ne.s32 %p117, %r404, 0;
@%p117 bra $L__BB0_189;
ld.param.u64 %rd188, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14];
cvta.to.global.u64 %rd142, %rd188;
mov.u32 %r405, %ctaid.x;
mov.u32 %r406, %ctaid.z;
mov.u32 %r407, %nctaid.x;
mad.lo.s32 %r408, %r406, %r407, %r405;
mul.wide.s32 %rd143, %r408, 8;
add.s64 %rd31, %rd142, %rd143;
add.s32 %r409, %r8, -1;
setp.eq.s32 %p118, %r90, %r409;
cvt.s64.s32 %rd144, %r8;
mov.u64 %rd145, -9223372036854775807;
sub.s64 %rd146, %rd145, %rd144;
selp.b64 %rd147, %rd146, 1, %p118;
atom.global.add.u64 %rd32, [%rd31], %rd147;
ld.volatile.global.u64 %rd148, [%rd31];
xor.b64 %rd149, %rd148, %rd32;
setp.lt.s64 %p119, %rd149, 0;
@%p119 bra $L__BB0_189;
mov.u32 %r574, 8;
$L__BB0_188:
// begin inline asm
nanosleep.u32 %r574;
// end inline asm
setp.lt.u32 %p120, %r574, 256;
selp.u32 %r412, 1, 0, %p120;
shl.b32 %r574, %r574, %r412;
ld.volatile.global.u64 %rd150, [%rd31];
xor.b64 %rd151, %rd150, %rd32;
setp.gt.s64 %p121, %rd151, -1;
@%p121 bra $L__BB0_188;
$L__BB0_189:
bar.sync 0;
add.s32 %r413, %r8, %r2;
add.s32 %r414, %r413, -1;
div.s32 %r93, %r414, %r2;
setp.lt.s32 %p122, %r93, 1;
mov.f32 %f510, 0f00000000;
mov.f32 %f511, %f510;
@%p122 bra $L__BB0_195;
add.s32 %r416, %r211, 1;
shr.u32 %r417, %r416, 31;
add.s32 %r418, %r416, %r417;
shr.s32 %r419, %r418, 1;
add.s32 %r420, %r3, %r419;
add.s32 %r421, %r420, -1;
shl.b32 %r422, %r7, 1;
shl.b32 %r423, %r3, 1;
mad.lo.s32 %r424, %r423, %r90, %r422;
or.b32 %r425, %r424, 1;
setp.ge.s32 %p123, %r425, %r211;
div.s32 %r426, %r421, %r3;
setp.ge.s32 %p124, %r90, %r426;
or.pred %p6, %p124, %p123;
mul.lo.s32 %r427, %r3, %r90;
shl.b32 %r428, %r427, 1;
mad.lo.s32 %r429, %r211, %r5, %r428;
add.s32 %r576, %r429, %r422;
mul.lo.s32 %r95, %r211, %r2;
mov.u32 %r415, 0;
mov.f32 %f510, 0f00000000;
mov.u32 %r575, %r5;
mov.u32 %r577, %r415;
$L__BB0_191:
.pragma "nounroll";
mov.u32 %r578, %r415;
mov.u32 %r579, %r415;
@%p6 bra $L__BB0_194;
setp.ge.s32 %p125, %r575, %r8;
mov.u32 %r578, %r415;
mov.u32 %r579, %r415;
@%p125 bra $L__BB0_194;
mul.wide.s32 %rd153, %r576, 4;
add.s64 %rd152, %rd48, %rd153;
// begin inline asm
ld.volatile.global.v2.s32 {%r579,%r578}, [%rd152];
// end inline asm
$L__BB0_194:
mov.b32 %f396, %r579;
add.f32 %f510, %f510, %f396;
mov.b32 %f397, %r578;
add.f32 %f511, %f511, %f397;
add.s32 %r576, %r576, %r95;
add.s32 %r575, %r575, %r2;
add.s32 %r577, %r577, 1;
setp.lt.s32 %p126, %r577, %r93;
@%p126 bra $L__BB0_191;
$L__BB0_195:
clz.b32 %r436, %r2;
mov.u32 %r437, 31;
sub.s32 %r438, %r437, %r436;
mov.u32 %r439, 1;
shl.b32 %r106, %r439, %r438;
setp.lt.u32 %p127, %r5, %r106;
add.s32 %r440, %r106, %r5;
setp.lt.u32 %p128, %r440, %r2;
and.pred %p7, %p127, %p128;
add.s32 %r441, %r39, %r106;
mul.wide.s32 %rd154, %r441, 4;
add.s64 %rd33, %rd50, %rd154;
shr.u32 %r442, %r106, 31;
add.s32 %r443, %r106, %r442;
shr.s32 %r600, %r443, 1;
st.shared.f32 [%rd25], %f510;
bar.sync 0;
not.pred %p129, %p7;
@%p129 bra $L__BB0_197;
ld.shared.f32 %f398, [%rd33];
ld.shared.f32 %f399, [%rd25];
add.f32 %f400, %f398, %f399;
st.shared.f32 [%rd25], %f400;
$L__BB0_197:
setp.lt.s32 %p130, %r106, 4;
bar.sync 0;
@%p130 bra $L__BB0_202;
mov.u32 %r580, %r600;
$L__BB0_199:
setp.ge.u32 %p131, %r5, %r580;
@%p131 bra $L__BB0_201;
add.s32 %r444, %r580, %r39;
mul.wide.s32 %rd156, %r444, 4;
add.s64 %rd158, %rd50, %rd156;
ld.shared.f32 %f401, [%rd25];
ld.shared.f32 %f402, [%rd158];
add.f32 %f403, %f402, %f401;
st.shared.f32 [%rd25], %f403;
$L__BB0_201:
bar.sync 0;
shr.u32 %r109, %r580, 1;
setp.gt.u32 %p132, %r580, 3;
mov.u32 %r580, %r109;
@%p132 bra $L__BB0_199;
$L__BB0_202:
add.s32 %r446, %r39, 1;
mul.wide.u32 %rd159, %r446, 4;
add.s64 %rd34, %rd50, %rd159;
setp.ne.s32 %p133, %r5, 0;
mov.u32 %r581, 0;
@%p133 bra $L__BB0_206;
setp.lt.u32 %p134, %r2, 2;
ld.shared.f32 %f404, [%rd25];
add.f32 %f512, %f404, 0f00000000;
@%p134 bra $L__BB0_205;
ld.shared.f32 %f405, [%rd34];
add.f32 %f512, %f512, %f405;
$L__BB0_205:
mov.b32 %r581, %f512;
$L__BB0_206:
bar.sync 0;
st.shared.f32 [%rd25], %f511;
bar.sync 0;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f406, [%rd33];
ld.shared.f32 %f407, [%rd25];
add.f32 %f408, %f406, %f407;
st.shared.f32 [%rd25], %f408;
$L__BB0_208:
bar.sync 0;
@%p130 bra $L__BB0_213;
mov.u32 %r582, %r600;
$L__BB0_210:
setp.ge.u32 %p137, %r5, %r582;
@%p137 bra $L__BB0_212;
add.s32 %r447, %r582, %r39;
mul.wide.s32 %rd161, %r447, 4;
add.s64 %rd163, %rd50, %rd161;
ld.shared.f32 %f409, [%rd25];
ld.shared.f32 %f410, [%rd163];
add.f32 %f411, %f410, %f409;
st.shared.f32 [%rd25], %f411;
$L__BB0_212:
bar.sync 0;
shr.u32 %r113, %r582, 1;
setp.gt.u32 %p138, %r582, 3;
mov.u32 %r582, %r113;
@%p138 bra $L__BB0_210;
$L__BB0_213:
mov.u32 %r583, 0;
@%p133 bra $L__BB0_217;
setp.lt.u32 %p140, %r2, 2;
ld.shared.f32 %f412, [%rd25];
add.f32 %f513, %f412, 0f00000000;
@%p140 bra $L__BB0_216;
ld.shared.f32 %f413, [%rd34];
add.f32 %f513, %f513, %f413;
$L__BB0_216:
mov.b32 %r583, %f513;
$L__BB0_217:
bar.sync 0;
@%p133 bra $L__BB0_221;
add.s32 %r449, %r211, 1;
shr.u32 %r450, %r449, 31;
add.s32 %r451, %r449, %r450;
shr.s32 %r452, %r451, 1;
add.s32 %r453, %r3, %r452;
add.s32 %r454, %r453, -1;
div.s32 %r455, %r454, %r3;
setp.ge.s32 %p142, %r90, %r455;
@%p142 bra $L__BB0_221;
shl.b32 %r116, %r7, 1;
mul.lo.s32 %r456, %r3, %r90;
shl.b32 %r117, %r456, 1;
add.s32 %r457, %r116, %r117;
or.b32 %r458, %r457, 1;
setp.ge.s32 %p143, %r458, %r211;
@%p143 bra $L__BB0_221;
ld.param.u64 %rd187, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
add.s32 %r461, %r117, %r116;
mul.wide.s32 %rd165, %r461, 4;
add.s64 %rd164, %rd187, %rd165;
// begin inline asm
st.global.cs.v2.s32 [%rd164], {%r581,%r583};
// end inline asm
$L__BB0_221:
mov.f32 %f516, 0f00000000;
mov.f32 %f517, %f516;
@%p122 bra $L__BB0_227;
add.s32 %r463, %r211, 1;
shr.u32 %r464, %r463, 31;
add.s32 %r465, %r463, %r464;
shr.s32 %r466, %r465, 1;
add.s32 %r467, %r3, %r466;
add.s32 %r468, %r467, -1;
shl.b32 %r469, %r7, 1;
shl.b32 %r470, %r3, 1;
mad.lo.s32 %r471, %r470, %r90, %r469;
or.b32 %r472, %r471, 1;
setp.ge.s32 %p145, %r472, %r211;
div.s32 %r473, %r468, %r3;
setp.ge.s32 %p146, %r90, %r473;
or.pred %p8, %p146, %p145;
mul.lo.s32 %r474, %r3, %r90;
shl.b32 %r475, %r474, 1;
mad.lo.s32 %r476, %r211, %r5, %r475;
add.s32 %r585, %r476, %r469;
mul.lo.s32 %r119, %r211, %r2;
mov.u32 %r462, 0;
mov.f32 %f516, 0f00000000;
mov.u32 %r584, %r5;
mov.u32 %r586, %r462;
$L__BB0_223:
.pragma "nounroll";
mov.u32 %r587, %r462;
mov.u32 %r588, %r462;
@%p8 bra $L__BB0_226;
setp.ge.s32 %p147, %r584, %r8;
mov.u32 %r587, %r462;
mov.u32 %r588, %r462;
@%p147 bra $L__BB0_226;
mul.wide.s32 %rd167, %r585, 4;
add.s64 %rd166, %rd47, %rd167;
// begin inline asm
ld.volatile.global.v2.s32 {%r588,%r587}, [%rd166];
// end inline asm
$L__BB0_226:
mov.b32 %f418, %r588;
add.f32 %f516, %f516, %f418;
mov.b32 %f419, %r587;
add.f32 %f517, %f517, %f419;
add.s32 %r585, %r585, %r119;
add.s32 %r584, %r584, %r2;
add.s32 %r586, %r586, 1;
setp.lt.s32 %p148, %r586, %r93;
@%p148 bra $L__BB0_223;
$L__BB0_227:
st.shared.f32 [%rd25], %f516;
bar.sync 0;
@%p129 bra $L__BB0_229;
ld.shared.f32 %f420, [%rd33];
ld.shared.f32 %f421, [%rd25];
add.f32 %f422, %f420, %f421;
st.shared.f32 [%rd25], %f422;
$L__BB0_229:
bar.sync 0;
@%p130 bra $L__BB0_234;
mov.u32 %r589, %r600;
$L__BB0_231:
setp.ge.u32 %p151, %r5, %r589;
@%p151 bra $L__BB0_233;
add.s32 %r483, %r589, %r39;
mul.wide.s32 %rd168, %r483, 4;
add.s64 %rd170, %rd50, %rd168;
ld.shared.f32 %f423, [%rd25];
ld.shared.f32 %f424, [%rd170];
add.f32 %f425, %f424, %f423;
st.shared.f32 [%rd25], %f425;
$L__BB0_233:
bar.sync 0;
shr.u32 %r131, %r589, 1;
setp.gt.u32 %p152, %r589, 3;
mov.u32 %r589, %r131;
@%p152 bra $L__BB0_231;
$L__BB0_234:
mov.u32 %r590, 0;
@%p133 bra $L__BB0_238;
setp.lt.u32 %p154, %r2, 2;
ld.shared.f32 %f426, [%rd25];
add.f32 %f518, %f426, 0f00000000;
@%p154 bra $L__BB0_237;
ld.shared.f32 %f427, [%rd34];
add.f32 %f518, %f518, %f427;
$L__BB0_237:
mov.b32 %r590, %f518;
$L__BB0_238:
bar.sync 0;
st.shared.f32 [%rd25], %f517;
bar.sync 0;
@%p129 bra $L__BB0_240;
ld.shared.f32 %f428, [%rd33];
ld.shared.f32 %f429, [%rd25];
add.f32 %f430, %f428, %f429;
st.shared.f32 [%rd25], %f430;
$L__BB0_240:
bar.sync 0;
@%p130 bra $L__BB0_245;
mov.u32 %r591, %r600;
$L__BB0_242:
setp.ge.u32 %p157, %r5, %r591;
@%p157 bra $L__BB0_244;
add.s32 %r485, %r591, %r39;
mul.wide.s32 %rd171, %r485, 4;
add.s64 %rd173, %rd50, %rd171;
ld.shared.f32 %f431, [%rd25];
ld.shared.f32 %f432, [%rd173];
add.f32 %f433, %f432, %f431;
st.shared.f32 [%rd25], %f433;
$L__BB0_244:
bar.sync 0;
shr.u32 %r135, %r591, 1;
setp.gt.u32 %p158, %r591, 3;
mov.u32 %r591, %r135;
@%p158 bra $L__BB0_242;
$L__BB0_245:
mov.u32 %r592, 0;
@%p133 bra $L__BB0_249;
setp.lt.u32 %p160, %r2, 2;
ld.shared.f32 %f434, [%rd25];
add.f32 %f519, %f434, 0f00000000;
@%p160 bra $L__BB0_248;
ld.shared.f32 %f435, [%rd34];
add.f32 %f519, %f519, %f435;
$L__BB0_248:
mov.b32 %r592, %f519;
$L__BB0_249:
bar.sync 0;
@%p133 bra $L__BB0_253;
add.s32 %r487, %r211, 1;
shr.u32 %r488, %r487, 31;
add.s32 %r489, %r487, %r488;
shr.s32 %r490, %r489, 1;
add.s32 %r491, %r3, %r490;
add.s32 %r492, %r491, -1;
div.s32 %r493, %r492, %r3;
setp.ge.s32 %p162, %r90, %r493;
@%p162 bra $L__BB0_253;
shl.b32 %r138, %r7, 1;
mul.lo.s32 %r494, %r3, %r90;
shl.b32 %r139, %r494, 1;
add.s32 %r495, %r138, %r139;
or.b32 %r496, %r495, 1;
setp.ge.s32 %p163, %r496, %r211;
@%p163 bra $L__BB0_253;
ld.param.u64 %rd186, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
add.s32 %r499, %r139, %r138;
mul.wide.s32 %rd175, %r499, 4;
add.s64 %rd174, %rd186, %rd175;
// begin inline asm
st.global.cs.v2.s32 [%rd174], {%r590,%r592};
// end inline asm
$L__BB0_253:
mov.f32 %f522, 0f00000000;
mov.f32 %f523, %f522;
@%p122 bra $L__BB0_259;
add.s32 %r501, %r211, 1;
shr.u32 %r502, %r501, 31;
add.s32 %r503, %r501, %r502;
shr.s32 %r504, %r503, 1;
add.s32 %r505, %r3, %r504;
add.s32 %r506, %r505, -1;
shl.b32 %r507, %r7, 1;
shl.b32 %r508, %r3, 1;
mad.lo.s32 %r509, %r508, %r90, %r507;
or.b32 %r510, %r509, 1;
setp.ge.s32 %p165, %r510, %r211;
div.s32 %r511, %r506, %r3;
setp.ge.s32 %p166, %r90, %r511;
or.pred %p9, %p166, %p165;
mul.lo.s32 %r512, %r3, %r90;
shl.b32 %r513, %r512, 1;
mad.lo.s32 %r514, %r211, %r5, %r513;
add.s32 %r594, %r514, %r507;
mul.lo.s32 %r141, %r211, %r2;
mov.u32 %r500, 0;
mov.f32 %f522, 0f00000000;
mov.u32 %r593, %r5;
mov.u32 %r595, %r500;
$L__BB0_255:
.pragma "nounroll";
mov.u32 %r596, %r500;
mov.u32 %r597, %r500;
@%p9 bra $L__BB0_258;
setp.ge.s32 %p167, %r593, %r8;
mov.u32 %r596, %r500;
mov.u32 %r597, %r500;
@%p167 bra $L__BB0_258;
mul.wide.s32 %rd177, %r594, 4;
add.s64 %rd176, %rd46, %rd177;
// begin inline asm
ld.volatile.global.v2.s32 {%r597,%r596}, [%rd176];
// end inline asm
$L__BB0_258:
mov.b32 %f440, %r597;
add.f32 %f522, %f522, %f440;
mov.b32 %f441, %r596;
add.f32 %f523, %f523, %f441;
add.s32 %r594, %r594, %r141;
add.s32 %r593, %r593, %r2;
add.s32 %r595, %r595, 1;
setp.lt.s32 %p168, %r595, %r93;
@%p168 bra $L__BB0_255;
$L__BB0_259:
st.shared.f32 [%rd25], %f522;
bar.sync 0;
@%p129 bra $L__BB0_261;
ld.shared.f32 %f442, [%rd33];
ld.shared.f32 %f443, [%rd25];
add.f32 %f444, %f442, %f443;
st.shared.f32 [%rd25], %f444;
$L__BB0_261:
bar.sync 0;
@%p130 bra $L__BB0_266;
mov.u32 %r598, %r600;
$L__BB0_263:
setp.ge.u32 %p171, %r5, %r598;
@%p171 bra $L__BB0_265;
add.s32 %r521, %r598, %r39;
mul.wide.s32 %rd178, %r521, 4;
add.s64 %rd180, %rd50, %rd178;
ld.shared.f32 %f445, [%rd25];
ld.shared.f32 %f446, [%rd180];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd25], %f447;
$L__BB0_265:
bar.sync 0;
shr.u32 %r153, %r598, 1;
setp.gt.u32 %p172, %r598, 3;
mov.u32 %r598, %r153;
@%p172 bra $L__BB0_263;
$L__BB0_266:
mov.u32 %r599, 0;
@%p133 bra $L__BB0_270;
setp.lt.u32 %p174, %r2, 2;
ld.shared.f32 %f448, [%rd25];
add.f32 %f524, %f448, 0f00000000;
@%p174 bra $L__BB0_269;
ld.shared.f32 %f449, [%rd34];
add.f32 %f524, %f524, %f449;
$L__BB0_269:
mov.b32 %r599, %f524;
$L__BB0_270:
bar.sync 0;
st.shared.f32 [%rd25], %f523;
bar.sync 0;
@%p129 bra $L__BB0_272;
ld.shared.f32 %f450, [%rd33];
ld.shared.f32 %f451, [%rd25];
add.f32 %f452, %f450, %f451;
st.shared.f32 [%rd25], %f452;
$L__BB0_272:
bar.sync 0;
@%p130 bra $L__BB0_276;
$L__BB0_273:
setp.ge.u32 %p177, %r5, %r600;
@%p177 bra $L__BB0_275;
add.s32 %r523, %r600, %r39;
mul.wide.s32 %rd181, %r523, 4;
add.s64 %rd183, %rd50, %rd181;
ld.shared.f32 %f453, [%rd25];
ld.shared.f32 %f454, [%rd183];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd25], %f455;
$L__BB0_275:
bar.sync 0;
shr.u32 %r157, %r600, 1;
setp.gt.u32 %p178, %r600, 3;
mov.u32 %r600, %r157;
@%p178 bra $L__BB0_273;
$L__BB0_276:
mov.u32 %r601, 0;
@%p133 bra $L__BB0_280;
setp.lt.u32 %p180, %r2, 2;
ld.shared.f32 %f456, [%rd25];
add.f32 %f525, %f456, 0f00000000;
@%p180 bra $L__BB0_279;
ld.shared.f32 %f457, [%rd34];
add.f32 %f525, %f525, %f457;
$L__BB0_279:
mov.b32 %r601, %f525;
$L__BB0_280:
bar.sync 0;
@%p133 bra $L__BB0_284;
add.s32 %r525, %r211, 1;
shr.u32 %r526, %r525, 31;
add.s32 %r527, %r525, %r526;
shr.s32 %r528, %r527, 1;
add.s32 %r529, %r3, %r528;
add.s32 %r530, %r529, -1;
div.s32 %r531, %r530, %r3;
setp.ge.s32 %p182, %r90, %r531;
@%p182 bra $L__BB0_284;
shl.b32 %r160, %r7, 1;
mul.lo.s32 %r532, %r3, %r90;
shl.b32 %r161, %r532, 1;
add.s32 %r533, %r160, %r161;
or.b32 %r534, %r533, 1;
setp.ge.s32 %p183, %r534, %r211;
@%p183 bra $L__BB0_284;
ld.param.u64 %rd189, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_0ab5d412_1033910nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r537, %r161, %r160;
mul.wide.s32 %rd185, %r537, 4;
add.s64 %rd184, %rd189, %rd185;
// begin inline asm
st.global.cs.v2.s32 [%rd184], {%r599,%r601};
// end inline asm
$L__BB0_284:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_12[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_13[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14[16]
)
{
.reg .pred %p<190>;
.reg .f32 %f<526>;
.reg .b32 %r<602>;
.reg .f64 %fd<3>;
.reg .b64 %rd<190>;
ld.param.v2.u32 {%r210, %r211}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r220, %r221}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r224, %r225}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3+16];
ld.param.u64 %rd48, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_13];
ld.param.u64 %rd47, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_12];
ld.param.u64 %rd46, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd41, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_6];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_5];
ld.param.u64 %rd39, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd38, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd37, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r258, %r211, 3;
shr.s32 %r259, %r258, 31;
shr.u32 %r260, %r259, 30;
add.s32 %r261, %r258, %r260;
shr.s32 %r2, %r261, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r262, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r263, %r4, 2;
mad.lo.s32 %r264, %r263, %r262, 15;
and.b32 %r265, %r264, -16;
cvt.u64.u32 %rd1, %r265;
mul.lo.s32 %r266, %r4, %r2;
shl.b32 %r267, %r266, 4;
or.b32 %r268, %r267, 15;
and.b32 %r5, %r268, -16;
add.s32 %r269, %r268, %r5;
and.b32 %r270, %r269, -16;
cvt.s64.s32 %rd2, %r270;
mov.u64 %rd50, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_72335arrayE;
cvta.shared.u64 %rd51, %rd50;
add.s64 %rd3, %rd51, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p10, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r271, %r7, 3;
setp.lt.s32 %p11, %r271, %r211;
and.pred %p1, %p11, %p10;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p12, %r8, 0;
and.pred %p2, %p12, %p1;
not.pred %p13, %p2;
@%p13 bra $L__BB0_2;
add.s64 %rd52, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd52; cvt.u32.u64 %r272, smem_ptr; }
// end inline asm
shl.b32 %r275, %r6, 4;
add.s32 %r273, %r272, %r275;
mul.wide.s32 %rd54, %r7, 4;
add.s64 %rd53, %rd39, %rd54;
mov.u32 %r274, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r274, 0;
cp.async.ca.shared.global [%r273], [%rd53], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r276, %r4, 63;
div.s32 %r277, %r276, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r278, %r9, %r277;
add.s32 %r279, %r278, -1;
div.s32 %r10, %r279, %r9;
setp.gt.s32 %p14, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p14 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r211;
cvt.s64.s32 %rd55, %r5;
add.s64 %rd56, %rd1, %rd55;
add.s64 %rd58, %rd50, %rd1;
mov.u32 %r281, %ctaid.y;
mul.lo.s32 %r282, %r10, %r4;
mul.lo.s32 %r11, %r282, %r281;
mad.lo.s32 %r283, %r2, %r8, %r6;
shl.b32 %r12, %r283, 4;
mul.lo.s32 %r284, %r211, %r8;
cvt.s64.s32 %rd59, %r284;
cvt.s64.s32 %rd60, %r7;
add.s64 %rd5, %rd59, %rd60;
mul.lo.s32 %r285, %r11, %r211;
cvt.s64.s32 %rd6, %r285;
mul.lo.s32 %r13, %r211, %r4;
mul.lo.s32 %r14, %r10, %r281;
shl.b32 %r286, %r8, 2;
mad.lo.s32 %r287, %r286, %r2, %r7;
add.s64 %rd61, %rd50, %rd56;
mul.wide.s32 %rd62, %r287, 4;
add.s64 %rd7, %rd61, %rd62;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r288, %tid.z;
mad.lo.s32 %r289, %r4, %r288, %r8;
mad.lo.s32 %r15, %r289, %r3, %r6;
mul.wide.u32 %rd63, %r15, 4;
add.s64 %rd8, %rd50, %rd63;
clz.b32 %r290, %r3;
mov.u32 %r291, 31;
sub.s32 %r292, %r291, %r290;
mov.u32 %r293, 1;
shl.b32 %r16, %r293, %r292;
setp.lt.u32 %p15, %r6, %r16;
add.s32 %r294, %r16, %r6;
setp.lt.u32 %p16, %r294, %r3;
and.pred %p3, %p15, %p16;
add.s32 %r295, %r15, %r16;
mul.wide.s32 %rd64, %r295, 4;
add.s64 %rd9, %rd50, %rd64;
shr.u32 %r296, %r16, 31;
add.s32 %r297, %r16, %r296;
shr.s32 %r17, %r297, 1;
add.s64 %rd10, %rd58, %rd62;
add.s32 %r298, %r15, 1;
mul.wide.u32 %rd65, %r298, 4;
add.s64 %rd11, %rd50, %rd65;
add.s64 %rd66, %rd50, %rd4;
mul.wide.s32 %rd67, %r7, 4;
add.s64 %rd12, %rd66, %rd67;
mul.wide.s32 %rd68, %r289, 4;
add.s64 %rd13, %rd50, %rd68;
add.s64 %rd14, %rd40, %rd67;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd17, %rd37;
cvta.to.global.u64 %rd18, %rd38;
add.s64 %rd21, %rd51, %rd56;
mov.u32 %r543, 0;
mov.f32 %f480, 0f00000000;
not.pred %p17, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd21; cvt.u32.u64 %r301, smem_ptr; }
// end inline asm
add.s32 %r302, %r301, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r327, smem_ptr; }
// end inline asm
add.s32 %r328, %r327, %r12;
not.pred %p27, %p3;
mov.f32 %f481, %f480;
mov.f32 %f482, %f480;
mov.f32 %f483, %f480;
mov.f32 %f488, %f480;
mov.f32 %f489, %f480;
mov.f32 %f490, %f480;
mov.f32 %f491, %f480;
mov.f32 %f492, %f480;
mov.f32 %f493, %f480;
mov.f32 %f494, %f480;
mov.f32 %f495, %f480;
$L__BB0_5:
.pragma "nounroll";
@%p17 bra $L__BB0_8;
mad.lo.s32 %r299, %r543, %r4, %r8;
add.s32 %r300, %r299, %r11;
setp.gt.s32 %p18, %r300, 63;
@%p18 bra $L__BB0_8;
mul.lo.s32 %r304, %r13, %r543;
cvt.s64.s32 %rd72, %r304;
add.s64 %rd73, %rd5, %rd72;
add.s64 %rd74, %rd73, %rd6;
shl.b64 %rd75, %rd74, 2;
add.s64 %rd71, %rd35, %rd75;
mov.u32 %r303, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r303, 0;
cp.async.ca.shared.global [%r302], [%rd71], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p17 bra $L__BB0_10;
add.s32 %r305, %r14, %r543;
mad.lo.s32 %r306, %r305, %r4, %r8;
setp.lt.s32 %p20, %r306, 64;
@%p20 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r544, %r545, %r546, %r547}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r315, %r14, %r543;
mad.lo.s32 %r316, %r315, %r4, %r8;
setp.gt.s32 %p21, %r316, 63;
mov.u32 %r544, 0;
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
@%p21 bra $L__BB0_15;
ld.shared.v4.u32 {%r544, %r545, %r546, %r547}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r544, 0;
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
$L__BB0_15:
add.s32 %r325, %r14, %r543;
mad.lo.s32 %r33, %r325, %r4, %r8;
mov.b32 %f160, %r547;
add.f32 %f495, %f495, %f160;
mov.b32 %f161, %r546;
add.f32 %f494, %f494, %f161;
mov.b32 %f162, %r545;
add.f32 %f493, %f493, %f162;
mov.b32 %f163, %r544;
add.f32 %f492, %f492, %f163;
setp.gt.s32 %p22, %r33, 63;
mov.f32 %f470, 0f00000000;
@%p22 bra $L__BB0_17;
mul.lo.s32 %r326, %r33, %r220;
mul.wide.s32 %rd76, %r326, 4;
add.s64 %rd77, %rd17, %rd76;
ld.global.f32 %f470, [%rd77];
$L__BB0_17:
add.s32 %r540, %r14, %r543;
mad.lo.s32 %r539, %r540, %r4, %r8;
setp.lt.s32 %p23, %r539, 64;
and.pred %p4, %p1, %p23;
not.pred %p24, %p4;
@%p24 bra $L__BB0_19;
mul.lo.s32 %r330, %r13, %r543;
cvt.s64.s32 %rd80, %r330;
add.s64 %rd81, %rd5, %rd80;
add.s64 %rd82, %rd81, %rd6;
shl.b64 %rd83, %rd82, 2;
add.s64 %rd79, %rd36, %rd83;
mov.u32 %r329, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r329, 0;
cp.async.ca.shared.global [%r328], [%rd79], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r542, %r14, %r543;
mad.lo.s32 %r541, %r542, %r4, %r8;
setp.gt.s32 %p189, %r541, 63;
mov.f32 %f476, 0f00000000;
mov.f32 %f471, %f476;
@%p189 bra $L__BB0_21;
mul.lo.s32 %r331, %r33, %r224;
mul.wide.s32 %rd84, %r331, 4;
add.s64 %rd85, %rd18, %rd84;
ld.global.f32 %f471, [%rd85];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f477, %f476;
@%p24 bra $L__BB0_23;
ld.shared.v4.f32 {%f167, %f168, %f169, %f170}, [%rd12];
ld.shared.v4.f32 {%f172, %f173, %f174, %f175}, [%rd7];
mul.f32 %f177, %f167, %f172;
add.f32 %f178, %f177, 0f00000000;
ld.shared.v4.f32 {%f179, %f180, %f181, %f182}, [%rd10];
sub.f32 %f184, %f179, %f470;
mul.f32 %f185, %f471, %f184;
fma.rn.f32 %f186, %f177, %f185, 0f00000000;
fma.rn.f32 %f488, %f185, %f172, %f488;
mul.f32 %f189, %f168, %f173;
add.f32 %f190, %f178, %f189;
sub.f32 %f192, %f180, %f470;
mul.f32 %f193, %f471, %f192;
fma.rn.f32 %f194, %f189, %f193, %f186;
fma.rn.f32 %f489, %f193, %f173, %f489;
mul.f32 %f197, %f169, %f174;
add.f32 %f198, %f190, %f197;
sub.f32 %f200, %f181, %f470;
mul.f32 %f201, %f471, %f200;
fma.rn.f32 %f202, %f197, %f201, %f194;
fma.rn.f32 %f490, %f201, %f174, %f490;
mul.f32 %f205, %f170, %f175;
add.f32 %f477, %f198, %f205;
sub.f32 %f207, %f182, %f470;
mul.f32 %f208, %f471, %f207;
fma.rn.f32 %f476, %f205, %f208, %f202;
fma.rn.f32 %f491, %f208, %f175, %f491;
$L__BB0_23:
st.shared.f32 [%rd8], %f477;
bar.sync 0;
@%p27 bra $L__BB0_25;
ld.shared.f32 %f209, [%rd9];
ld.shared.f32 %f210, [%rd8];
add.f32 %f211, %f209, %f210;
st.shared.f32 [%rd8], %f211;
$L__BB0_25:
setp.lt.s32 %p28, %r16, 4;
bar.sync 0;
@%p28 bra $L__BB0_30;
mov.u32 %r548, %r17;
$L__BB0_27:
setp.ge.u32 %p29, %r6, %r548;
@%p29 bra $L__BB0_29;
add.s32 %r332, %r548, %r15;
mul.wide.s32 %rd86, %r332, 4;
add.s64 %rd88, %rd50, %rd86;
ld.shared.f32 %f212, [%rd8];
ld.shared.f32 %f213, [%rd88];
add.f32 %f214, %f213, %f212;
st.shared.f32 [%rd8], %f214;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r548, 1;
setp.gt.u32 %p30, %r548, 3;
mov.u32 %r548, %r35;
@%p30 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p31, %r6, 0;
mov.f32 %f478, 0f00000000;
@%p31 bra $L__BB0_33;
setp.lt.u32 %p32, %r3, 2;
ld.shared.f32 %f216, [%rd8];
add.f32 %f478, %f216, 0f00000000;
@%p32 bra $L__BB0_33;
ld.shared.f32 %f217, [%rd11];
add.f32 %f478, %f478, %f217;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f476;
bar.sync 0;
@%p27 bra $L__BB0_35;
ld.shared.f32 %f218, [%rd9];
ld.shared.f32 %f219, [%rd8];
add.f32 %f220, %f218, %f219;
st.shared.f32 [%rd8], %f220;
$L__BB0_35:
setp.lt.s32 %p184, %r16, 4;
bar.sync 0;
@%p184 bra $L__BB0_40;
mov.u32 %r549, %r17;
$L__BB0_37:
setp.ge.u32 %p35, %r6, %r549;
@%p35 bra $L__BB0_39;
add.s32 %r333, %r549, %r15;
mul.wide.s32 %rd89, %r333, 4;
add.s64 %rd91, %rd50, %rd89;
ld.shared.f32 %f221, [%rd8];
ld.shared.f32 %f222, [%rd91];
add.f32 %f223, %f222, %f221;
st.shared.f32 [%rd8], %f223;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r549, 1;
setp.gt.u32 %p36, %r549, 3;
mov.u32 %r549, %r37;
@%p36 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f479, 0f00000000;
@%p31 bra $L__BB0_43;
setp.lt.u32 %p38, %r3, 2;
ld.shared.f32 %f225, [%rd8];
add.f32 %f479, %f225, 0f00000000;
@%p38 bra $L__BB0_43;
ld.shared.f32 %f226, [%rd11];
add.f32 %f479, %f479, %f226;
$L__BB0_43:
bar.sync 0;
@%p31 bra $L__BB0_45;
st.shared.f32 [%rd13], %f478;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f41, [%rd13];
bar.sync 0;
@%p31 bra $L__BB0_47;
st.shared.f32 [%rd13], %f479;
$L__BB0_47:
add.s32 %r538, %r14, %r543;
mad.lo.s32 %r537, %r538, %r4, %r8;
setp.lt.s32 %p186, %r537, 64;
and.pred %p185, %p1, %p186;
bar.sync 0;
ld.shared.f32 %f42, [%rd13];
bar.sync 0;
@%p185 bra $L__BB0_50;
bra.uni $L__BB0_48;
$L__BB0_50:
mul.f32 %f227, %f471, %f1;
// begin inline asm
ld.global.cs.v4.u32 {%r338,%r339,%r340,%r341}, [%rd14];
// end inline asm
mov.b32 %f228, %r338;
add.f32 %f229, %f470, %f228;
add.f32 %f480, %f480, %f229;
ld.shared.v4.f32 {%f230, %f231, %f232, %f233}, [%rd12];
ld.shared.v4.f32 {%f235, %f236, %f237, %f238}, [%rd7];
mul.f32 %f240, %f230, %f235;
mul.f32 %f241, %f240, %f2;
ld.shared.v4.f32 {%f242, %f243, %f244, %f245}, [%rd10];
sub.f32 %f247, %f242, %f470;
mul.f32 %f248, %f471, %f247;
sub.f32 %f249, %f241, %f41;
mul.f32 %f250, %f42, %f248;
sub.f32 %f251, %f249, %f250;
mul.f32 %f252, %f227, %f251;
add.f32 %f253, %f229, %f252;
mov.b32 %r342, %f253;
mov.b32 %r346, %f252;
mov.b32 %f254, %r339;
add.f32 %f255, %f470, %f254;
add.f32 %f481, %f481, %f255;
mul.f32 %f258, %f231, %f236;
mul.f32 %f259, %f258, %f2;
sub.f32 %f261, %f243, %f470;
mul.f32 %f262, %f471, %f261;
sub.f32 %f263, %f259, %f41;
mul.f32 %f264, %f42, %f262;
sub.f32 %f265, %f263, %f264;
mul.f32 %f266, %f227, %f265;
add.f32 %f267, %f255, %f266;
mov.b32 %r343, %f267;
mov.b32 %r347, %f266;
mov.b32 %f268, %r340;
add.f32 %f269, %f470, %f268;
add.f32 %f482, %f482, %f269;
mul.f32 %f272, %f232, %f237;
mul.f32 %f273, %f272, %f2;
sub.f32 %f275, %f244, %f470;
mul.f32 %f276, %f471, %f275;
sub.f32 %f277, %f273, %f41;
mul.f32 %f278, %f42, %f276;
sub.f32 %f279, %f277, %f278;
mul.f32 %f280, %f227, %f279;
add.f32 %f281, %f269, %f280;
mov.b32 %r344, %f281;
mov.b32 %r348, %f280;
mov.b32 %f282, %r341;
add.f32 %f283, %f470, %f282;
add.f32 %f483, %f483, %f283;
mul.f32 %f286, %f233, %f238;
mul.f32 %f287, %f286, %f2;
sub.f32 %f289, %f245, %f470;
mul.f32 %f290, %f471, %f289;
sub.f32 %f291, %f287, %f41;
mul.f32 %f292, %f42, %f290;
sub.f32 %f293, %f291, %f292;
mul.f32 %f294, %f227, %f293;
add.f32 %f295, %f283, %f294;
mov.b32 %r345, %f295;
mov.b32 %r349, %f294;
mad.lo.s32 %r350, %r33, %r211, %r7;
mul.wide.s32 %rd96, %r350, 4;
add.s64 %rd94, %rd41, %rd96;
// begin inline asm
st.global.cs.v4.s32 [%rd94], {%r342,%r343,%r344,%r345};
// end inline asm
add.s64 %rd95, %rd43, %rd96;
// begin inline asm
st.global.cs.v4.s32 [%rd95], {%r346,%r347,%r348,%r349};
// end inline asm
bra.uni $L__BB0_51;
$L__BB0_48:
@%p17 bra $L__BB0_51;
// begin inline asm
ld.global.cs.v4.u32 {%r334,%r335,%r336,%r337}, [%rd14];
// end inline asm
$L__BB0_51:
add.s32 %r543, %r543, 1;
setp.lt.s32 %p42, %r543, %r10;
@%p42 bra $L__BB0_5;
bra.uni $L__BB0_52;
$L__BB0_3:
mov.f32 %f480, 0f00000000;
mov.f32 %f481, %f480;
mov.f32 %f482, %f480;
mov.f32 %f483, %f480;
mov.f32 %f488, %f480;
mov.f32 %f489, %f480;
mov.f32 %f490, %f480;
mov.f32 %f491, %f480;
mov.f32 %f492, %f480;
mov.f32 %f493, %f480;
mov.f32 %f494, %f480;
mov.f32 %f495, %f480;
$L__BB0_52:
mov.u32 %r351, %tid.z;
mad.lo.s32 %r352, %r4, %r351, %r8;
mad.lo.s32 %r39, %r352, %r3, %r6;
mul.wide.u32 %rd97, %r39, 4;
add.s64 %rd25, %rd50, %rd97;
clz.b32 %r353, %r4;
mov.u32 %r354, 31;
sub.s32 %r355, %r354, %r353;
mov.u32 %r356, 1;
shl.b32 %r40, %r356, %r355;
setp.lt.u32 %p43, %r8, %r40;
add.s32 %r357, %r40, %r8;
setp.lt.u32 %p44, %r357, %r4;
and.pred %p5, %p43, %p44;
shl.b32 %r358, %r3, %r355;
add.s32 %r359, %r39, %r358;
mul.wide.s32 %rd99, %r359, 4;
add.s64 %rd26, %rd50, %rd99;
shr.u32 %r360, %r40, 31;
add.s32 %r361, %r40, %r360;
shr.s32 %r572, %r361, 1;
st.shared.f32 [%rd25], %f480;
bar.sync 0;
not.pred %p45, %p5;
@%p45 bra $L__BB0_54;
ld.shared.f32 %f296, [%rd26];
ld.shared.f32 %f297, [%rd25];
add.f32 %f298, %f296, %f297;
st.shared.f32 [%rd25], %f298;
$L__BB0_54:
setp.lt.s32 %p46, %r40, 4;
bar.sync 0;
@%p46 bra $L__BB0_59;
mov.u32 %r550, %r572;
$L__BB0_56:
setp.ge.u32 %p47, %r8, %r550;
@%p47 bra $L__BB0_58;
mad.lo.s32 %r362, %r550, %r3, %r39;
mul.wide.s32 %rd100, %r362, 4;
add.s64 %rd102, %rd50, %rd100;
ld.shared.f32 %f299, [%rd25];
ld.shared.f32 %f300, [%rd102];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd25], %f301;
$L__BB0_58:
bar.sync 0;
shr.u32 %r43, %r550, 1;
setp.gt.u32 %p48, %r550, 3;
mov.u32 %r550, %r43;
@%p48 bra $L__BB0_56;
$L__BB0_59:
add.s32 %r364, %r39, %r3;
mul.wide.u32 %rd103, %r364, 4;
add.s64 %rd27, %rd50, %rd103;
setp.ne.s32 %p49, %r8, 0;
mov.u32 %r551, 0;
@%p49 bra $L__BB0_63;
setp.lt.u32 %p50, %r4, 2;
ld.shared.f32 %f302, [%rd25];
add.f32 %f496, %f302, 0f00000000;
@%p50 bra $L__BB0_62;
ld.shared.f32 %f303, [%rd27];
add.f32 %f496, %f496, %f303;
$L__BB0_62:
mov.b32 %r551, %f496;
$L__BB0_63:
bar.sync 0;
st.shared.f32 [%rd25], %f481;
bar.sync 0;
@%p45 bra $L__BB0_65;
ld.shared.f32 %f304, [%rd26];
ld.shared.f32 %f305, [%rd25];
add.f32 %f306, %f304, %f305;
st.shared.f32 [%rd25], %f306;
$L__BB0_65:
bar.sync 0;
@%p46 bra $L__BB0_70;
mov.u32 %r552, %r572;
$L__BB0_67:
setp.ge.u32 %p53, %r8, %r552;
@%p53 bra $L__BB0_69;
mad.lo.s32 %r365, %r552, %r3, %r39;
mul.wide.s32 %rd105, %r365, 4;
add.s64 %rd107, %rd50, %rd105;
ld.shared.f32 %f307, [%rd25];
ld.shared.f32 %f308, [%rd107];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd25], %f309;
$L__BB0_69:
bar.sync 0;
shr.u32 %r47, %r552, 1;
setp.gt.u32 %p54, %r552, 3;
mov.u32 %r552, %r47;
@%p54 bra $L__BB0_67;
$L__BB0_70:
mov.u32 %r553, 0;
@%p49 bra $L__BB0_74;
setp.lt.u32 %p56, %r4, 2;
ld.shared.f32 %f310, [%rd25];
add.f32 %f497, %f310, 0f00000000;
@%p56 bra $L__BB0_73;
ld.shared.f32 %f311, [%rd27];
add.f32 %f497, %f497, %f311;
$L__BB0_73:
mov.b32 %r553, %f497;
$L__BB0_74:
bar.sync 0;
st.shared.f32 [%rd25], %f482;
bar.sync 0;
@%p45 bra $L__BB0_76;
ld.shared.f32 %f312, [%rd26];
ld.shared.f32 %f313, [%rd25];
add.f32 %f314, %f312, %f313;
st.shared.f32 [%rd25], %f314;
$L__BB0_76:
bar.sync 0;
@%p46 bra $L__BB0_81;
mov.u32 %r554, %r572;
$L__BB0_78:
setp.ge.u32 %p59, %r8, %r554;
@%p59 bra $L__BB0_80;
mad.lo.s32 %r367, %r554, %r3, %r39;
mul.wide.s32 %rd108, %r367, 4;
add.s64 %rd110, %rd50, %rd108;
ld.shared.f32 %f315, [%rd25];
ld.shared.f32 %f316, [%rd110];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd25], %f317;
$L__BB0_80:
bar.sync 0;
shr.u32 %r51, %r554, 1;
setp.gt.u32 %p60, %r554, 3;
mov.u32 %r554, %r51;
@%p60 bra $L__BB0_78;
$L__BB0_81:
mov.u32 %r555, 0;
@%p49 bra $L__BB0_85;
setp.lt.u32 %p62, %r4, 2;
ld.shared.f32 %f318, [%rd25];
add.f32 %f498, %f318, 0f00000000;
@%p62 bra $L__BB0_84;
ld.shared.f32 %f319, [%rd27];
add.f32 %f498, %f498, %f319;
$L__BB0_84:
mov.b32 %r555, %f498;
$L__BB0_85:
bar.sync 0;
st.shared.f32 [%rd25], %f483;
bar.sync 0;
@%p45 bra $L__BB0_87;
ld.shared.f32 %f320, [%rd26];
ld.shared.f32 %f321, [%rd25];
add.f32 %f322, %f320, %f321;
st.shared.f32 [%rd25], %f322;
$L__BB0_87:
bar.sync 0;
@%p46 bra $L__BB0_92;
mov.u32 %r556, %r572;
$L__BB0_89:
setp.ge.u32 %p65, %r8, %r556;
@%p65 bra $L__BB0_91;
mad.lo.s32 %r369, %r556, %r3, %r39;
mul.wide.s32 %rd111, %r369, 4;
add.s64 %rd113, %rd50, %rd111;
ld.shared.f32 %f323, [%rd25];
ld.shared.f32 %f324, [%rd113];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd25], %f325;
$L__BB0_91:
bar.sync 0;
shr.u32 %r55, %r556, 1;
setp.gt.u32 %p66, %r556, 3;
mov.u32 %r556, %r55;
@%p66 bra $L__BB0_89;
$L__BB0_92:
mov.u32 %r557, 0;
@%p49 bra $L__BB0_96;
setp.lt.u32 %p68, %r4, 2;
ld.shared.f32 %f326, [%rd25];
add.f32 %f499, %f326, 0f00000000;
@%p68 bra $L__BB0_95;
ld.shared.f32 %f327, [%rd27];
add.f32 %f499, %f499, %f327;
$L__BB0_95:
mov.b32 %r557, %f499;
$L__BB0_96:
bar.sync 0;
st.shared.f32 [%rd25], %f488;
bar.sync 0;
@%p45 bra $L__BB0_98;
ld.shared.f32 %f328, [%rd26];
ld.shared.f32 %f329, [%rd25];
add.f32 %f330, %f328, %f329;
st.shared.f32 [%rd25], %f330;
$L__BB0_98:
bar.sync 0;
@%p46 bra $L__BB0_103;
mov.u32 %r558, %r572;
$L__BB0_100:
setp.ge.u32 %p71, %r8, %r558;
@%p71 bra $L__BB0_102;
mad.lo.s32 %r371, %r558, %r3, %r39;
mul.wide.s32 %rd114, %r371, 4;
add.s64 %rd116, %rd50, %rd114;
ld.shared.f32 %f331, [%rd25];
ld.shared.f32 %f332, [%rd116];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd25], %f333;
$L__BB0_102:
bar.sync 0;
shr.u32 %r59, %r558, 1;
setp.gt.u32 %p72, %r558, 3;
mov.u32 %r558, %r59;
@%p72 bra $L__BB0_100;
$L__BB0_103:
mov.u32 %r559, 0;
@%p49 bra $L__BB0_107;
setp.lt.u32 %p74, %r4, 2;
ld.shared.f32 %f334, [%rd25];
add.f32 %f500, %f334, 0f00000000;
@%p74 bra $L__BB0_106;
ld.shared.f32 %f335, [%rd27];
add.f32 %f500, %f500, %f335;
$L__BB0_106:
mov.b32 %r559, %f500;
$L__BB0_107:
bar.sync 0;
st.shared.f32 [%rd25], %f489;
bar.sync 0;
@%p45 bra $L__BB0_109;
ld.shared.f32 %f336, [%rd26];
ld.shared.f32 %f337, [%rd25];
add.f32 %f338, %f336, %f337;
st.shared.f32 [%rd25], %f338;
$L__BB0_109:
bar.sync 0;
@%p46 bra $L__BB0_114;
mov.u32 %r560, %r572;
$L__BB0_111:
setp.ge.u32 %p77, %r8, %r560;
@%p77 bra $L__BB0_113;
mad.lo.s32 %r373, %r560, %r3, %r39;
mul.wide.s32 %rd117, %r373, 4;
add.s64 %rd119, %rd50, %rd117;
ld.shared.f32 %f339, [%rd25];
ld.shared.f32 %f340, [%rd119];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd25], %f341;
$L__BB0_113:
bar.sync 0;
shr.u32 %r63, %r560, 1;
setp.gt.u32 %p78, %r560, 3;
mov.u32 %r560, %r63;
@%p78 bra $L__BB0_111;
$L__BB0_114:
mov.u32 %r561, 0;
@%p49 bra $L__BB0_118;
setp.lt.u32 %p80, %r4, 2;
ld.shared.f32 %f342, [%rd25];
add.f32 %f501, %f342, 0f00000000;
@%p80 bra $L__BB0_117;
ld.shared.f32 %f343, [%rd27];
add.f32 %f501, %f501, %f343;
$L__BB0_117:
mov.b32 %r561, %f501;
$L__BB0_118:
bar.sync 0;
st.shared.f32 [%rd25], %f490;
bar.sync 0;
@%p45 bra $L__BB0_120;
ld.shared.f32 %f344, [%rd26];
ld.shared.f32 %f345, [%rd25];
add.f32 %f346, %f344, %f345;
st.shared.f32 [%rd25], %f346;
$L__BB0_120:
bar.sync 0;
@%p46 bra $L__BB0_125;
mov.u32 %r562, %r572;
$L__BB0_122:
setp.ge.u32 %p83, %r8, %r562;
@%p83 bra $L__BB0_124;
mad.lo.s32 %r375, %r562, %r3, %r39;
mul.wide.s32 %rd120, %r375, 4;
add.s64 %rd122, %rd50, %rd120;
ld.shared.f32 %f347, [%rd25];
ld.shared.f32 %f348, [%rd122];
add.f32 %f349, %f348, %f347;
st.shared.f32 [%rd25], %f349;
$L__BB0_124:
bar.sync 0;
shr.u32 %r67, %r562, 1;
setp.gt.u32 %p84, %r562, 3;
mov.u32 %r562, %r67;
@%p84 bra $L__BB0_122;
$L__BB0_125:
mov.u32 %r563, 0;
@%p49 bra $L__BB0_129;
setp.lt.u32 %p86, %r4, 2;
ld.shared.f32 %f350, [%rd25];
add.f32 %f502, %f350, 0f00000000;
@%p86 bra $L__BB0_128;
ld.shared.f32 %f351, [%rd27];
add.f32 %f502, %f502, %f351;
$L__BB0_128:
mov.b32 %r563, %f502;
$L__BB0_129:
bar.sync 0;
st.shared.f32 [%rd25], %f491;
bar.sync 0;
@%p45 bra $L__BB0_131;
ld.shared.f32 %f352, [%rd26];
ld.shared.f32 %f353, [%rd25];
add.f32 %f354, %f352, %f353;
st.shared.f32 [%rd25], %f354;
$L__BB0_131:
bar.sync 0;
@%p46 bra $L__BB0_136;
mov.u32 %r564, %r572;
$L__BB0_133:
setp.ge.u32 %p89, %r8, %r564;
@%p89 bra $L__BB0_135;
mad.lo.s32 %r377, %r564, %r3, %r39;
mul.wide.s32 %rd123, %r377, 4;
add.s64 %rd125, %rd50, %rd123;
ld.shared.f32 %f355, [%rd25];
ld.shared.f32 %f356, [%rd125];
add.f32 %f357, %f356, %f355;
st.shared.f32 [%rd25], %f357;
$L__BB0_135:
bar.sync 0;
shr.u32 %r71, %r564, 1;
setp.gt.u32 %p90, %r564, 3;
mov.u32 %r564, %r71;
@%p90 bra $L__BB0_133;
$L__BB0_136:
mov.u32 %r565, 0;
@%p49 bra $L__BB0_140;
setp.lt.u32 %p92, %r4, 2;
ld.shared.f32 %f358, [%rd25];
add.f32 %f503, %f358, 0f00000000;
@%p92 bra $L__BB0_139;
ld.shared.f32 %f359, [%rd27];
add.f32 %f503, %f503, %f359;
$L__BB0_139:
mov.b32 %r565, %f503;
$L__BB0_140:
bar.sync 0;
st.shared.f32 [%rd25], %f492;
bar.sync 0;
@%p45 bra $L__BB0_142;
ld.shared.f32 %f360, [%rd26];
ld.shared.f32 %f361, [%rd25];
add.f32 %f362, %f360, %f361;
st.shared.f32 [%rd25], %f362;
$L__BB0_142:
bar.sync 0;
@%p46 bra $L__BB0_147;
mov.u32 %r566, %r572;
$L__BB0_144:
setp.ge.u32 %p95, %r8, %r566;
@%p95 bra $L__BB0_146;
mad.lo.s32 %r379, %r566, %r3, %r39;
mul.wide.s32 %rd126, %r379, 4;
add.s64 %rd128, %rd50, %rd126;
ld.shared.f32 %f363, [%rd25];
ld.shared.f32 %f364, [%rd128];
add.f32 %f365, %f364, %f363;
st.shared.f32 [%rd25], %f365;
$L__BB0_146:
bar.sync 0;
shr.u32 %r75, %r566, 1;
setp.gt.u32 %p96, %r566, 3;
mov.u32 %r566, %r75;
@%p96 bra $L__BB0_144;
$L__BB0_147:
mov.u32 %r567, 0;
@%p49 bra $L__BB0_151;
setp.lt.u32 %p98, %r4, 2;
ld.shared.f32 %f366, [%rd25];
add.f32 %f504, %f366, 0f00000000;
@%p98 bra $L__BB0_150;
ld.shared.f32 %f367, [%rd27];
add.f32 %f504, %f504, %f367;
$L__BB0_150:
mov.b32 %r567, %f504;
$L__BB0_151:
bar.sync 0;
st.shared.f32 [%rd25], %f493;
bar.sync 0;
@%p45 bra $L__BB0_153;
ld.shared.f32 %f368, [%rd26];
ld.shared.f32 %f369, [%rd25];
add.f32 %f370, %f368, %f369;
st.shared.f32 [%rd25], %f370;
$L__BB0_153:
bar.sync 0;
@%p46 bra $L__BB0_158;
mov.u32 %r568, %r572;
$L__BB0_155:
setp.ge.u32 %p101, %r8, %r568;
@%p101 bra $L__BB0_157;
mad.lo.s32 %r381, %r568, %r3, %r39;
mul.wide.s32 %rd129, %r381, 4;
add.s64 %rd131, %rd50, %rd129;
ld.shared.f32 %f371, [%rd25];
ld.shared.f32 %f372, [%rd131];
add.f32 %f373, %f372, %f371;
st.shared.f32 [%rd25], %f373;
$L__BB0_157:
bar.sync 0;
shr.u32 %r79, %r568, 1;
setp.gt.u32 %p102, %r568, 3;
mov.u32 %r568, %r79;
@%p102 bra $L__BB0_155;
$L__BB0_158:
mov.u32 %r569, 0;
@%p49 bra $L__BB0_162;
setp.lt.u32 %p104, %r4, 2;
ld.shared.f32 %f374, [%rd25];
add.f32 %f505, %f374, 0f00000000;
@%p104 bra $L__BB0_161;
ld.shared.f32 %f375, [%rd27];
add.f32 %f505, %f505, %f375;
$L__BB0_161:
mov.b32 %r569, %f505;
$L__BB0_162:
bar.sync 0;
st.shared.f32 [%rd25], %f494;
bar.sync 0;
@%p45 bra $L__BB0_164;
ld.shared.f32 %f376, [%rd26];
ld.shared.f32 %f377, [%rd25];
add.f32 %f378, %f376, %f377;
st.shared.f32 [%rd25], %f378;
$L__BB0_164:
bar.sync 0;
@%p46 bra $L__BB0_169;
mov.u32 %r570, %r572;
$L__BB0_166:
setp.ge.u32 %p107, %r8, %r570;
@%p107 bra $L__BB0_168;
mad.lo.s32 %r383, %r570, %r3, %r39;
mul.wide.s32 %rd132, %r383, 4;
add.s64 %rd134, %rd50, %rd132;
ld.shared.f32 %f379, [%rd25];
ld.shared.f32 %f380, [%rd134];
add.f32 %f381, %f380, %f379;
st.shared.f32 [%rd25], %f381;
$L__BB0_168:
bar.sync 0;
shr.u32 %r83, %r570, 1;
setp.gt.u32 %p108, %r570, 3;
mov.u32 %r570, %r83;
@%p108 bra $L__BB0_166;
$L__BB0_169:
mov.u32 %r571, 0;
@%p49 bra $L__BB0_173;
setp.lt.u32 %p110, %r4, 2;
ld.shared.f32 %f382, [%rd25];
add.f32 %f506, %f382, 0f00000000;
@%p110 bra $L__BB0_172;
ld.shared.f32 %f383, [%rd27];
add.f32 %f506, %f506, %f383;
$L__BB0_172:
mov.b32 %r571, %f506;
$L__BB0_173:
bar.sync 0;
st.shared.f32 [%rd25], %f495;
bar.sync 0;
@%p45 bra $L__BB0_175;
ld.shared.f32 %f384, [%rd26];
ld.shared.f32 %f385, [%rd25];
add.f32 %f386, %f384, %f385;
st.shared.f32 [%rd25], %f386;
$L__BB0_175:
bar.sync 0;
@%p46 bra $L__BB0_179;
$L__BB0_176:
setp.ge.u32 %p113, %r8, %r572;
@%p113 bra $L__BB0_178;
mad.lo.s32 %r385, %r572, %r3, %r39;
mul.wide.s32 %rd135, %r385, 4;
add.s64 %rd137, %rd50, %rd135;
ld.shared.f32 %f387, [%rd25];
ld.shared.f32 %f388, [%rd137];
add.f32 %f389, %f388, %f387;
st.shared.f32 [%rd25], %f389;
$L__BB0_178:
bar.sync 0;
shr.u32 %r87, %r572, 1;
setp.gt.u32 %p114, %r572, 3;
mov.u32 %r572, %r87;
@%p114 bra $L__BB0_176;
$L__BB0_179:
mov.u32 %r573, 0;
@%p49 bra $L__BB0_183;
setp.lt.u32 %p116, %r4, 2;
ld.shared.f32 %f390, [%rd25];
add.f32 %f507, %f390, 0f00000000;
@%p116 bra $L__BB0_182;
ld.shared.f32 %f391, [%rd27];
add.f32 %f507, %f507, %f391;
$L__BB0_182:
mov.b32 %r573, %f507;
$L__BB0_183:
setp.eq.s32 %p188, %r8, 0;
and.pred %p187, %p188, %p1;
bar.sync 0;
@%p187 bra $L__BB0_184;
bra.uni $L__BB0_185;
$L__BB0_184:
mov.u32 %r399, %ctaid.y;
mad.lo.s32 %r400, %r211, %r399, %r7;
mul.wide.s32 %rd141, %r400, 4;
add.s64 %rd138, %rd46, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd138], {%r551,%r553,%r555,%r557};
// end inline asm
add.s64 %rd139, %rd47, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd139], {%r559,%r561,%r563,%r565};
// end inline asm
add.s64 %rd140, %rd48, %rd141;
// begin inline asm
st.volatile.global.v4.s32 [%rd140], {%r567,%r569,%r571,%r573};
// end inline asm
$L__BB0_185:
mov.u32 %r90, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r401, %r6, %r8;
or.b32 %r403, %r401, %r351;
setp.ne.s32 %p117, %r403, 0;
@%p117 bra $L__BB0_189;
ld.param.u64 %rd189, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14];
cvta.to.global.u64 %rd142, %rd189;
mov.u32 %r404, %ctaid.x;
mov.u32 %r405, %ctaid.z;
mov.u32 %r406, %nctaid.x;
mad.lo.s32 %r407, %r405, %r406, %r404;
mul.wide.s32 %rd143, %r407, 8;
add.s64 %rd31, %rd142, %rd143;
add.s32 %r408, %r9, -1;
setp.eq.s32 %p118, %r90, %r408;
cvt.s64.s32 %rd144, %r9;
mov.u64 %rd145, -9223372036854775807;
sub.s64 %rd146, %rd145, %rd144;
selp.b64 %rd147, %rd146, 1, %p118;
atom.global.add.u64 %rd32, [%rd31], %rd147;
ld.volatile.global.u64 %rd148, [%rd31];
xor.b64 %rd149, %rd148, %rd32;
setp.lt.s64 %p119, %rd149, 0;
@%p119 bra $L__BB0_189;
mov.u32 %r574, 8;
$L__BB0_188:
// begin inline asm
nanosleep.u32 %r574;
// end inline asm
setp.lt.u32 %p120, %r574, 256;
selp.u32 %r411, 1, 0, %p120;
shl.b32 %r574, %r574, %r411;
ld.volatile.global.u64 %rd150, [%rd31];
xor.b64 %rd151, %rd150, %rd32;
setp.gt.s64 %p121, %rd151, -1;
@%p121 bra $L__BB0_188;
$L__BB0_189:
bar.sync 0;
add.s32 %r412, %r9, %r3;
add.s32 %r413, %r412, -1;
div.s32 %r93, %r413, %r3;
setp.lt.s32 %p122, %r93, 1;
mov.f32 %f510, 0f00000000;
mov.f32 %f511, %f510;
@%p122 bra $L__BB0_195;
add.s32 %r415, %r211, 1;
shr.u32 %r416, %r415, 31;
add.s32 %r417, %r415, %r416;
shr.s32 %r418, %r417, 1;
add.s32 %r419, %r4, %r418;
add.s32 %r420, %r419, -1;
shl.b32 %r421, %r8, 1;
shl.b32 %r422, %r4, 1;
mad.lo.s32 %r423, %r422, %r90, %r421;
or.b32 %r424, %r423, 1;
setp.ge.s32 %p123, %r424, %r211;
div.s32 %r425, %r420, %r4;
setp.ge.s32 %p124, %r90, %r425;
or.pred %p6, %p124, %p123;
mul.lo.s32 %r426, %r4, %r90;
shl.b32 %r427, %r426, 1;
mad.lo.s32 %r428, %r211, %r6, %r427;
add.s32 %r576, %r428, %r421;
mul.lo.s32 %r95, %r211, %r3;
mov.u32 %r414, 0;
mov.f32 %f510, 0f00000000;
mov.u32 %r575, %r6;
mov.u32 %r577, %r414;
$L__BB0_191:
.pragma "nounroll";
mov.u32 %r578, %r414;
mov.u32 %r579, %r414;
@%p6 bra $L__BB0_194;
setp.ge.s32 %p125, %r575, %r9;
mov.u32 %r578, %r414;
mov.u32 %r579, %r414;
@%p125 bra $L__BB0_194;
mul.wide.s32 %rd153, %r576, 4;
add.s64 %rd152, %rd48, %rd153;
// begin inline asm
ld.volatile.global.v2.s32 {%r579,%r578}, [%rd152];
// end inline asm
$L__BB0_194:
mov.b32 %f396, %r579;
add.f32 %f510, %f510, %f396;
mov.b32 %f397, %r578;
add.f32 %f511, %f511, %f397;
add.s32 %r576, %r576, %r95;
add.s32 %r575, %r575, %r3;
add.s32 %r577, %r577, 1;
setp.lt.s32 %p126, %r577, %r93;
@%p126 bra $L__BB0_191;
$L__BB0_195:
clz.b32 %r435, %r3;
mov.u32 %r436, 31;
sub.s32 %r437, %r436, %r435;
mov.u32 %r438, 1;
shl.b32 %r106, %r438, %r437;
setp.lt.u32 %p127, %r6, %r106;
add.s32 %r439, %r106, %r6;
setp.lt.u32 %p128, %r439, %r3;
and.pred %p7, %p127, %p128;
add.s32 %r440, %r39, %r106;
mul.wide.s32 %rd154, %r440, 4;
add.s64 %rd33, %rd50, %rd154;
shr.u32 %r441, %r106, 31;
add.s32 %r442, %r106, %r441;
shr.s32 %r600, %r442, 1;
st.shared.f32 [%rd25], %f510;
bar.sync 0;
not.pred %p129, %p7;
@%p129 bra $L__BB0_197;
ld.shared.f32 %f398, [%rd33];
ld.shared.f32 %f399, [%rd25];
add.f32 %f400, %f398, %f399;
st.shared.f32 [%rd25], %f400;
$L__BB0_197:
setp.lt.s32 %p130, %r106, 4;
bar.sync 0;
@%p130 bra $L__BB0_202;
mov.u32 %r580, %r600;
$L__BB0_199:
setp.ge.u32 %p131, %r6, %r580;
@%p131 bra $L__BB0_201;
add.s32 %r443, %r580, %r39;
mul.wide.s32 %rd156, %r443, 4;
add.s64 %rd158, %rd50, %rd156;
ld.shared.f32 %f401, [%rd25];
ld.shared.f32 %f402, [%rd158];
add.f32 %f403, %f402, %f401;
st.shared.f32 [%rd25], %f403;
$L__BB0_201:
bar.sync 0;
shr.u32 %r109, %r580, 1;
setp.gt.u32 %p132, %r580, 3;
mov.u32 %r580, %r109;
@%p132 bra $L__BB0_199;
$L__BB0_202:
add.s32 %r445, %r39, 1;
mul.wide.u32 %rd159, %r445, 4;
add.s64 %rd34, %rd50, %rd159;
setp.ne.s32 %p133, %r6, 0;
mov.u32 %r581, 0;
@%p133 bra $L__BB0_206;
setp.lt.u32 %p134, %r3, 2;
ld.shared.f32 %f404, [%rd25];
add.f32 %f512, %f404, 0f00000000;
@%p134 bra $L__BB0_205;
ld.shared.f32 %f405, [%rd34];
add.f32 %f512, %f512, %f405;
$L__BB0_205:
mov.b32 %r581, %f512;
$L__BB0_206:
bar.sync 0;
st.shared.f32 [%rd25], %f511;
bar.sync 0;
@%p129 bra $L__BB0_208;
ld.shared.f32 %f406, [%rd33];
ld.shared.f32 %f407, [%rd25];
add.f32 %f408, %f406, %f407;
st.shared.f32 [%rd25], %f408;
$L__BB0_208:
bar.sync 0;
@%p130 bra $L__BB0_213;
mov.u32 %r582, %r600;
$L__BB0_210:
setp.ge.u32 %p137, %r6, %r582;
@%p137 bra $L__BB0_212;
add.s32 %r446, %r582, %r39;
mul.wide.s32 %rd161, %r446, 4;
add.s64 %rd163, %rd50, %rd161;
ld.shared.f32 %f409, [%rd25];
ld.shared.f32 %f410, [%rd163];
add.f32 %f411, %f410, %f409;
st.shared.f32 [%rd25], %f411;
$L__BB0_212:
bar.sync 0;
shr.u32 %r113, %r582, 1;
setp.gt.u32 %p138, %r582, 3;
mov.u32 %r582, %r113;
@%p138 bra $L__BB0_210;
$L__BB0_213:
mov.u32 %r583, 0;
@%p133 bra $L__BB0_217;
setp.lt.u32 %p140, %r3, 2;
ld.shared.f32 %f412, [%rd25];
add.f32 %f513, %f412, 0f00000000;
@%p140 bra $L__BB0_216;
ld.shared.f32 %f413, [%rd34];
add.f32 %f513, %f513, %f413;
$L__BB0_216:
mov.b32 %r583, %f513;
$L__BB0_217:
bar.sync 0;
@%p133 bra $L__BB0_221;
add.s32 %r448, %r211, 1;
shr.u32 %r449, %r448, 31;
add.s32 %r450, %r448, %r449;
shr.s32 %r451, %r450, 1;
add.s32 %r452, %r4, %r451;
add.s32 %r453, %r452, -1;
div.s32 %r454, %r453, %r4;
setp.ge.s32 %p142, %r90, %r454;
@%p142 bra $L__BB0_221;
shl.b32 %r116, %r8, 1;
mul.lo.s32 %r455, %r4, %r90;
shl.b32 %r117, %r455, 1;
add.s32 %r456, %r116, %r117;
or.b32 %r457, %r456, 1;
setp.ge.s32 %p143, %r457, %r211;
@%p143 bra $L__BB0_221;
ld.param.u64 %rd188, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
add.s32 %r460, %r117, %r116;
mul.wide.s32 %rd165, %r460, 4;
add.s64 %rd164, %rd188, %rd165;
// begin inline asm
st.global.cs.v2.s32 [%rd164], {%r581,%r583};
// end inline asm
$L__BB0_221:
mov.f32 %f516, 0f00000000;
mov.f32 %f517, %f516;
@%p122 bra $L__BB0_227;
add.s32 %r462, %r211, 1;
shr.u32 %r463, %r462, 31;
add.s32 %r464, %r462, %r463;
shr.s32 %r465, %r464, 1;
add.s32 %r466, %r4, %r465;
add.s32 %r467, %r466, -1;
shl.b32 %r468, %r8, 1;
shl.b32 %r469, %r4, 1;
mad.lo.s32 %r470, %r469, %r90, %r468;
or.b32 %r471, %r470, 1;
setp.ge.s32 %p145, %r471, %r211;
div.s32 %r472, %r467, %r4;
setp.ge.s32 %p146, %r90, %r472;
or.pred %p8, %p146, %p145;
mul.lo.s32 %r473, %r4, %r90;
shl.b32 %r474, %r473, 1;
mad.lo.s32 %r475, %r211, %r6, %r474;
add.s32 %r585, %r475, %r468;
mul.lo.s32 %r119, %r211, %r3;
mov.u32 %r461, 0;
mov.f32 %f516, 0f00000000;
mov.u32 %r584, %r6;
mov.u32 %r586, %r461;
$L__BB0_223:
.pragma "nounroll";
mov.u32 %r587, %r461;
mov.u32 %r588, %r461;
@%p8 bra $L__BB0_226;
setp.ge.s32 %p147, %r584, %r9;
mov.u32 %r587, %r461;
mov.u32 %r588, %r461;
@%p147 bra $L__BB0_226;
mul.wide.s32 %rd167, %r585, 4;
add.s64 %rd166, %rd47, %rd167;
// begin inline asm
ld.volatile.global.v2.s32 {%r588,%r587}, [%rd166];
// end inline asm
$L__BB0_226:
mov.b32 %f418, %r588;
add.f32 %f516, %f516, %f418;
mov.b32 %f419, %r587;
add.f32 %f517, %f517, %f419;
add.s32 %r585, %r585, %r119;
add.s32 %r584, %r584, %r3;
add.s32 %r586, %r586, 1;
setp.lt.s32 %p148, %r586, %r93;
@%p148 bra $L__BB0_223;
$L__BB0_227:
st.shared.f32 [%rd25], %f516;
bar.sync 0;
@%p129 bra $L__BB0_229;
ld.shared.f32 %f420, [%rd33];
ld.shared.f32 %f421, [%rd25];
add.f32 %f422, %f420, %f421;
st.shared.f32 [%rd25], %f422;
$L__BB0_229:
bar.sync 0;
@%p130 bra $L__BB0_234;
mov.u32 %r589, %r600;
$L__BB0_231:
setp.ge.u32 %p151, %r6, %r589;
@%p151 bra $L__BB0_233;
add.s32 %r482, %r589, %r39;
mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd50, %rd168;
ld.shared.f32 %f423, [%rd25];
ld.shared.f32 %f424, [%rd170];
add.f32 %f425, %f424, %f423;
st.shared.f32 [%rd25], %f425;
$L__BB0_233:
bar.sync 0;
shr.u32 %r131, %r589, 1;
setp.gt.u32 %p152, %r589, 3;
mov.u32 %r589, %r131;
@%p152 bra $L__BB0_231;
$L__BB0_234:
mov.u32 %r590, 0;
@%p133 bra $L__BB0_238;
setp.lt.u32 %p154, %r3, 2;
ld.shared.f32 %f426, [%rd25];
add.f32 %f518, %f426, 0f00000000;
@%p154 bra $L__BB0_237;
ld.shared.f32 %f427, [%rd34];
add.f32 %f518, %f518, %f427;
$L__BB0_237:
mov.b32 %r590, %f518;
$L__BB0_238:
bar.sync 0;
st.shared.f32 [%rd25], %f517;
bar.sync 0;
@%p129 bra $L__BB0_240;
ld.shared.f32 %f428, [%rd33];
ld.shared.f32 %f429, [%rd25];
add.f32 %f430, %f428, %f429;
st.shared.f32 [%rd25], %f430;
$L__BB0_240:
bar.sync 0;
@%p130 bra $L__BB0_245;
mov.u32 %r591, %r600;
$L__BB0_242:
setp.ge.u32 %p157, %r6, %r591;
@%p157 bra $L__BB0_244;
add.s32 %r484, %r591, %r39;
mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd50, %rd171;
ld.shared.f32 %f431, [%rd25];
ld.shared.f32 %f432, [%rd173];
add.f32 %f433, %f432, %f431;
st.shared.f32 [%rd25], %f433;
$L__BB0_244:
bar.sync 0;
shr.u32 %r135, %r591, 1;
setp.gt.u32 %p158, %r591, 3;
mov.u32 %r591, %r135;
@%p158 bra $L__BB0_242;
$L__BB0_245:
mov.u32 %r592, 0;
@%p133 bra $L__BB0_249;
setp.lt.u32 %p160, %r3, 2;
ld.shared.f32 %f434, [%rd25];
add.f32 %f519, %f434, 0f00000000;
@%p160 bra $L__BB0_248;
ld.shared.f32 %f435, [%rd34];
add.f32 %f519, %f519, %f435;
$L__BB0_248:
mov.b32 %r592, %f519;
$L__BB0_249:
bar.sync 0;
@%p133 bra $L__BB0_253;
add.s32 %r486, %r211, 1;
shr.u32 %r487, %r486, 31;
add.s32 %r488, %r486, %r487;
shr.s32 %r489, %r488, 1;
add.s32 %r490, %r4, %r489;
add.s32 %r491, %r490, -1;
div.s32 %r492, %r491, %r4;
setp.ge.s32 %p162, %r90, %r492;
@%p162 bra $L__BB0_253;
shl.b32 %r138, %r8, 1;
mul.lo.s32 %r493, %r4, %r90;
shl.b32 %r139, %r493, 1;
add.s32 %r494, %r138, %r139;
or.b32 %r495, %r494, 1;
setp.ge.s32 %p163, %r495, %r211;
@%p163 bra $L__BB0_253;
ld.param.u64 %rd187, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
add.s32 %r498, %r139, %r138;
mul.wide.s32 %rd175, %r498, 4;
add.s64 %rd174, %rd187, %rd175;
// begin inline asm
st.global.cs.v2.s32 [%rd174], {%r590,%r592};
// end inline asm
$L__BB0_253:
mov.f32 %f522, 0f00000000;
mov.f32 %f523, %f522;
@%p122 bra $L__BB0_259;
add.s32 %r500, %r211, 1;
shr.u32 %r501, %r500, 31;
add.s32 %r502, %r500, %r501;
shr.s32 %r503, %r502, 1;
add.s32 %r504, %r4, %r503;
add.s32 %r505, %r504, -1;
shl.b32 %r506, %r8, 1;
shl.b32 %r507, %r4, 1;
mad.lo.s32 %r508, %r507, %r90, %r506;
or.b32 %r509, %r508, 1;
setp.ge.s32 %p165, %r509, %r211;
div.s32 %r510, %r505, %r4;
setp.ge.s32 %p166, %r90, %r510;
or.pred %p9, %p166, %p165;
mul.lo.s32 %r511, %r4, %r90;
shl.b32 %r512, %r511, 1;
mad.lo.s32 %r513, %r211, %r6, %r512;
add.s32 %r594, %r513, %r506;
mul.lo.s32 %r141, %r211, %r3;
mov.u32 %r499, 0;
mov.f32 %f522, 0f00000000;
mov.u32 %r593, %r6;
mov.u32 %r595, %r499;
$L__BB0_255:
.pragma "nounroll";
mov.u32 %r596, %r499;
mov.u32 %r597, %r499;
@%p9 bra $L__BB0_258;
setp.ge.s32 %p167, %r593, %r9;
mov.u32 %r596, %r499;
mov.u32 %r597, %r499;
@%p167 bra $L__BB0_258;
mul.wide.s32 %rd177, %r594, 4;
add.s64 %rd176, %rd46, %rd177;
// begin inline asm
ld.volatile.global.v2.s32 {%r597,%r596}, [%rd176];
// end inline asm
$L__BB0_258:
mov.b32 %f440, %r597;
add.f32 %f522, %f522, %f440;
mov.b32 %f441, %r596;
add.f32 %f523, %f523, %f441;
add.s32 %r594, %r594, %r141;
add.s32 %r593, %r593, %r3;
add.s32 %r595, %r595, 1;
setp.lt.s32 %p168, %r595, %r93;
@%p168 bra $L__BB0_255;
$L__BB0_259:
st.shared.f32 [%rd25], %f522;
bar.sync 0;
@%p129 bra $L__BB0_261;
ld.shared.f32 %f442, [%rd33];
ld.shared.f32 %f443, [%rd25];
add.f32 %f444, %f442, %f443;
st.shared.f32 [%rd25], %f444;
$L__BB0_261:
bar.sync 0;
@%p130 bra $L__BB0_266;
mov.u32 %r598, %r600;
$L__BB0_263:
setp.ge.u32 %p171, %r6, %r598;
@%p171 bra $L__BB0_265;
add.s32 %r520, %r598, %r39;
mul.wide.s32 %rd178, %r520, 4;
add.s64 %rd180, %rd50, %rd178;
ld.shared.f32 %f445, [%rd25];
ld.shared.f32 %f446, [%rd180];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd25], %f447;
$L__BB0_265:
bar.sync 0;
shr.u32 %r153, %r598, 1;
setp.gt.u32 %p172, %r598, 3;
mov.u32 %r598, %r153;
@%p172 bra $L__BB0_263;
$L__BB0_266:
mov.u32 %r599, 0;
@%p133 bra $L__BB0_270;
setp.lt.u32 %p174, %r3, 2;
ld.shared.f32 %f448, [%rd25];
add.f32 %f524, %f448, 0f00000000;
@%p174 bra $L__BB0_269;
ld.shared.f32 %f449, [%rd34];
add.f32 %f524, %f524, %f449;
$L__BB0_269:
mov.b32 %r599, %f524;
$L__BB0_270:
bar.sync 0;
st.shared.f32 [%rd25], %f523;
bar.sync 0;
@%p129 bra $L__BB0_272;
ld.shared.f32 %f450, [%rd33];
ld.shared.f32 %f451, [%rd25];
add.f32 %f452, %f450, %f451;
st.shared.f32 [%rd25], %f452;
$L__BB0_272:
bar.sync 0;
@%p130 bra $L__BB0_276;
$L__BB0_273:
setp.ge.u32 %p177, %r6, %r600;
@%p177 bra $L__BB0_275;
add.s32 %r522, %r600, %r39;
mul.wide.s32 %rd181, %r522, 4;
add.s64 %rd183, %rd50, %rd181;
ld.shared.f32 %f453, [%rd25];
ld.shared.f32 %f454, [%rd183];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd25], %f455;
$L__BB0_275:
bar.sync 0;
shr.u32 %r157, %r600, 1;
setp.gt.u32 %p178, %r600, 3;
mov.u32 %r600, %r157;
@%p178 bra $L__BB0_273;
$L__BB0_276:
mov.u32 %r601, 0;
@%p133 bra $L__BB0_280;
setp.lt.u32 %p180, %r3, 2;
ld.shared.f32 %f456, [%rd25];
add.f32 %f525, %f456, 0f00000000;
@%p180 bra $L__BB0_279;
ld.shared.f32 %f457, [%rd34];
add.f32 %f525, %f525, %f457;
$L__BB0_279:
mov.b32 %r601, %f525;
$L__BB0_280:
bar.sync 0;
@%p133 bra $L__BB0_284;
add.s32 %r524, %r211, 1;
shr.u32 %r525, %r524, 31;
add.s32 %r526, %r524, %r525;
shr.s32 %r527, %r526, 1;
add.s32 %r528, %r4, %r527;
add.s32 %r529, %r528, -1;
div.s32 %r530, %r529, %r4;
setp.ge.s32 %p182, %r90, %r530;
@%p182 bra $L__BB0_284;
shl.b32 %r160, %r8, 1;
mul.lo.s32 %r531, %r4, %r90;
shl.b32 %r161, %r531, 1;
add.s32 %r532, %r160, %r161;
or.b32 %r533, %r532, 1;
setp.ge.s32 %p183, %r533, %r211;
@%p183 bra $L__BB0_284;
ld.param.u64 %rd186, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_61_cu_7d6b89bd_723310nvfuser_61ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r536, %r161, %r160;
mul.wide.s32 %rd185, %r536, 4;
add.s64 %rd184, %rd186, %rd185;
// begin inline asm
st.global.cs.v2.s32 [%rd184], {%r599,%r601};
// end inline asm
$L__BB0_284:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -56,110 +56,110 @@
ld.param.u64 %rd35, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r258, %r211, 3;
shr.s32 %r259, %r258, 31;
shr.u32 %r260, %r259, 30;
add.s32 %r261, %r258, %r260;
- shr.s32 %r262, %r261, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r263, %r262, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r264, %r3, 2;
- mad.lo.s32 %r265, %r264, %r263, 15;
- and.b32 %r266, %r265, -16;
- cvt.u64.u32 %rd1, %r266;
- mul.lo.s32 %r267, %r3, %r262;
- shl.b32 %r268, %r267, 4;
- or.b32 %r269, %r268, 15;
- and.b32 %r4, %r269, -16;
- add.s32 %r270, %r269, %r4;
- and.b32 %r271, %r270, -16;
- cvt.s64.s32 %rd2, %r271;
+ shr.s32 %r2, %r261, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r262, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r263, %r4, 2;
+ mad.lo.s32 %r264, %r263, %r262, 15;
+ and.b32 %r265, %r264, -16;
+ cvt.u64.u32 %rd1, %r265;
+ mul.lo.s32 %r266, %r4, %r2;
+ shl.b32 %r267, %r266, 4;
+ or.b32 %r268, %r267, 15;
+ and.b32 %r5, %r268, -16;
+ add.s32 %r269, %r268, %r5;
+ and.b32 %r270, %r269, -16;
+ cvt.s64.s32 %rd2, %r270;
mov.u64 %rd50, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd51, %rd50;
add.s64 %rd3, %rd51, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p10, %r5, %r262;
- shl.b32 %r6, %r5, 2;
- or.b32 %r272, %r6, 3;
- setp.lt.s32 %p11, %r272, %r211;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p10, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r271, %r7, 3;
+ setp.lt.s32 %p11, %r271, %r211;
and.pred %p1, %p11, %p10;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p12, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p12, %r8, 0;
and.pred %p2, %p12, %p1;
not.pred %p13, %p2;
@%p13 bra $L__BB0_2;
add.s64 %rd52, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd52; cvt.u32.u64 %r273, smem_ptr; }
-
-
- shl.b32 %r276, %r5, 4;
- add.s32 %r274, %r273, %r276;
- mul.wide.s32 %rd54, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd52; cvt.u32.u64 %r272, smem_ptr; }
+
+
+ shl.b32 %r275, %r6, 4;
+ add.s32 %r273, %r272, %r275;
+ mul.wide.s32 %rd54, %r7, 4;
add.s64 %rd53, %rd39, %rd54;
- mov.u32 %r275, 0;
+ mov.u32 %r274, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r275, 0;
- cp.async.ca.shared.global [%r274], [%rd53], 16, p0;
+ setp.ne.b32 p0, %r274, 0;
+ cp.async.ca.shared.global [%r273], [%rd53], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r277, %r3, 63;
- div.s32 %r278, %r277, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r279, %r8, %r278;
- add.s32 %r280, %r279, -1;
- div.s32 %r9, %r280, %r8;
- setp.gt.s32 %p14, %r9, 0;
+ add.s32 %r276, %r4, 63;
+ div.s32 %r277, %r276, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r278, %r9, %r277;
+ add.s32 %r279, %r278, -1;
+ div.s32 %r10, %r279, %r9;
+ setp.gt.s32 %p14, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p14 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r211;
- cvt.s64.s32 %rd55, %r4;
+ cvt.s64.s32 %rd55, %r5;
add.s64 %rd56, %rd1, %rd55;
add.s64 %rd58, %rd50, %rd1;
- mov.u32 %r282, %ctaid.y;
- mul.lo.s32 %r283, %r9, %r3;
- mul.lo.s32 %r10, %r283, %r282;
- shl.b32 %r284, %r7, 2;
- shl.b32 %r285, %r5, 4;
- mad.lo.s32 %r11, %r284, %r211, %r285;
- mul.lo.s32 %r286, %r211, %r7;
- cvt.s64.s32 %rd59, %r286;
- cvt.s64.s32 %rd60, %r6;
+ mov.u32 %r281, %ctaid.y;
+ mul.lo.s32 %r282, %r10, %r4;
+ mul.lo.s32 %r11, %r282, %r281;
+ mad.lo.s32 %r283, %r2, %r8, %r6;
+ shl.b32 %r12, %r283, 4;
+ mul.lo.s32 %r284, %r211, %r8;
+ cvt.s64.s32 %rd59, %r284;
+ cvt.s64.s32 %rd60, %r7;
add.s64 %rd5, %rd59, %rd60;
- mul.lo.s32 %r287, %r10, %r211;
- cvt.s64.s32 %rd6, %r287;
- mul.lo.s32 %r12, %r211, %r3;
- mul.lo.s32 %r13, %r9, %r282;
- add.s32 %r14, %r286, %r6;
+ mul.lo.s32 %r285, %r11, %r211;
+ cvt.s64.s32 %rd6, %r285;
+ mul.lo.s32 %r13, %r211, %r4;
+ mul.lo.s32 %r14, %r10, %r281;
+ shl.b32 %r286, %r8, 2;
+ mad.lo.s32 %r287, %r286, %r2, %r7;
add.s64 %rd61, %rd50, %rd56;
- mul.wide.s32 %rd62, %r14, 4;
+ mul.wide.s32 %rd62, %r287, 4;
add.s64 %rd7, %rd61, %rd62;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r288, %tid.z;
- mad.lo.s32 %r289, %r3, %r288, %r7;
- mad.lo.s32 %r15, %r289, %r2, %r5;
+ mad.lo.s32 %r289, %r4, %r288, %r8;
+ mad.lo.s32 %r15, %r289, %r3, %r6;
mul.wide.u32 %rd63, %r15, 4;
add.s64 %rd8, %rd50, %rd63;
- clz.b32 %r290, %r2;
+ clz.b32 %r290, %r3;
mov.u32 %r291, 31;
sub.s32 %r292, %r291, %r290;
mov.u32 %r293, 1;
shl.b32 %r16, %r293, %r292;
- setp.lt.u32 %p15, %r5, %r16;
- add.s32 %r294, %r16, %r5;
- setp.lt.u32 %p16, %r294, %r2;
+ setp.lt.u32 %p15, %r6, %r16;
+ add.s32 %r294, %r16, %r6;
+ setp.lt.u32 %p16, %r294, %r3;
and.pred %p3, %p15, %p16;
add.s32 %r295, %r15, %r16;
mul.wide.s32 %rd64, %r295, 4;
add.s64 %rd9, %rd50, %rd64;
shr.u32 %r296, %r16, 31;
@@ -168,11 +168,11 @@
add.s64 %rd10, %rd58, %rd62;
add.s32 %r298, %r15, 1;
mul.wide.u32 %rd65, %r298, 4;
add.s64 %rd11, %rd50, %rd65;
add.s64 %rd66, %rd50, %rd4;
- mul.wide.s32 %rd67, %r6, 4;
+ mul.wide.s32 %rd67, %r7, 4;
add.s64 %rd12, %rd66, %rd67;
mul.wide.s32 %rd68, %r289, 4;
add.s64 %rd13, %rd50, %rd68;
add.s64 %rd14, %rd40, %rd67;
cvt.rn.f32.f64 %f2, %fd1;
@@ -184,16 +184,16 @@
not.pred %p17, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd21; cvt.u32.u64 %r301, smem_ptr; }
- add.s32 %r302, %r11, %r301;
+ add.s32 %r302, %r301, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r327, smem_ptr; }
- add.s32 %r328, %r11, %r327;
+ add.s32 %r328, %r327, %r12;
not.pred %p27, %p3;
mov.f32 %f481, %f480;
mov.f32 %f482, %f480;
mov.f32 %f483, %f480;
mov.f32 %f488, %f480;
@@ -207,16 +207,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p17 bra $L__BB0_8;
- mad.lo.s32 %r299, %r543, %r3, %r7;
- add.s32 %r300, %r299, %r10;
+ mad.lo.s32 %r299, %r543, %r4, %r8;
+ add.s32 %r300, %r299, %r11;
setp.gt.s32 %p18, %r300, 63;
@%p18 bra $L__BB0_8;
- mul.lo.s32 %r304, %r12, %r543;
+ mul.lo.s32 %r304, %r13, %r543;
cvt.s64.s32 %rd72, %r304;
add.s64 %rd73, %rd5, %rd72;
add.s64 %rd74, %rd73, %rd6;
shl.b64 %rd75, %rd74, 2;
add.s64 %rd71, %rd35, %rd75;
@@ -235,12 +235,12 @@
cp.async.wait_all;
@%p17 bra $L__BB0_10;
- add.s32 %r305, %r13, %r543;
- mad.lo.s32 %r306, %r305, %r3, %r7;
+ add.s32 %r305, %r14, %r543;
+ mad.lo.s32 %r306, %r305, %r4, %r8;
setp.lt.s32 %p20, %r306, 64;
@%p20 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
@@ -250,12 +250,12 @@
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r315, %r13, %r543;
- mad.lo.s32 %r316, %r315, %r3, %r7;
+ add.s32 %r315, %r14, %r543;
+ mad.lo.s32 %r316, %r315, %r4, %r8;
setp.gt.s32 %p21, %r316, 63;
mov.u32 %r544, 0;
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
@@ -269,12 +269,12 @@
mov.u32 %r545, %r544;
mov.u32 %r546, %r544;
mov.u32 %r547, %r544;
$L__BB0_15:
- add.s32 %r325, %r13, %r543;
- mad.lo.s32 %r33, %r325, %r3, %r7;
+ add.s32 %r325, %r14, %r543;
+ mad.lo.s32 %r33, %r325, %r4, %r8;
mov.b32 %f160, %r547;
add.f32 %f495, %f495, %f160;
mov.b32 %f161, %r546;
add.f32 %f494, %f494, %f161;
mov.b32 %f162, %r545;
@@ -289,16 +289,18 @@
mul.wide.s32 %rd76, %r326, 4;
add.s64 %rd77, %rd17, %rd76;
ld.global.f32 %f470, [%rd77];
$L__BB0_17:
- setp.lt.s32 %p23, %r33, 64;
+ add.s32 %r540, %r14, %r543;
+ mad.lo.s32 %r539, %r540, %r4, %r8;
+ setp.lt.s32 %p23, %r539, 64;
and.pred %p4, %p1, %p23;
not.pred %p24, %p4;
@%p24 bra $L__BB0_19;
- mul.lo.s32 %r330, %r12, %r543;
+ mul.lo.s32 %r330, %r13, %r543;
cvt.s64.s32 %rd80, %r330;
add.s64 %rd81, %rd5, %rd80;
add.s64 %rd82, %rd81, %rd6;
shl.b64 %rd83, %rd82, 2;
add.s64 %rd79, %rd36, %rd83;
@@ -311,12 +313,12 @@
}
$L__BB0_19:
- add.s32 %r542, %r13, %r543;
- mad.lo.s32 %r541, %r542, %r3, %r7;
+ add.s32 %r542, %r14, %r543;
+ mad.lo.s32 %r541, %r542, %r4, %r8;
setp.gt.s32 %p189, %r541, 63;
mov.f32 %f476, 0f00000000;
mov.f32 %f471, %f476;
@%p189 bra $L__BB0_21;
@@ -377,11 +379,11 @@
@%p28 bra $L__BB0_30;
mov.u32 %r548, %r17;
$L__BB0_27:
- setp.ge.u32 %p29, %r5, %r548;
+ setp.ge.u32 %p29, %r6, %r548;
@%p29 bra $L__BB0_29;
add.s32 %r332, %r548, %r15;
mul.wide.s32 %rd86, %r332, 4;
add.s64 %rd88, %rd50, %rd86;
@@ -396,15 +398,15 @@
setp.gt.u32 %p30, %r548, 3;
mov.u32 %r548, %r35;
@%p30 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p31, %r5, 0;
+ setp.ne.s32 %p31, %r6, 0;
mov.f32 %f478, 0f00000000;
@%p31 bra $L__BB0_33;
- setp.lt.u32 %p32, %r2, 2;
+ setp.lt.u32 %p32, %r3, 2;
ld.shared.f32 %f216, [%rd8];
add.f32 %f478, %f216, 0f00000000;
@%p32 bra $L__BB0_33;
ld.shared.f32 %f217, [%rd11];
@@ -427,11 +429,11 @@
@%p184 bra $L__BB0_40;
mov.u32 %r549, %r17;
$L__BB0_37:
- setp.ge.u32 %p35, %r5, %r549;
+ setp.ge.u32 %p35, %r6, %r549;
@%p35 bra $L__BB0_39;
add.s32 %r333, %r549, %r15;
mul.wide.s32 %rd89, %r333, 4;
add.s64 %rd91, %rd50, %rd89;
@@ -449,11 +451,11 @@
$L__BB0_40:
mov.f32 %f479, 0f00000000;
@%p31 bra $L__BB0_43;
- setp.lt.u32 %p38, %r2, 2;
+ setp.lt.u32 %p38, %r3, 2;
ld.shared.f32 %f225, [%rd8];
add.f32 %f479, %f225, 0f00000000;
@%p38 bra $L__BB0_43;
ld.shared.f32 %f226, [%rd11];
@@ -472,18 +474,18 @@
@%p31 bra $L__BB0_47;
st.shared.f32 [%rd13], %f479;
$L__BB0_47:
- add.s32 %r540, %r13, %r543;
- mad.lo.s32 %r539, %r540, %r3, %r7;
- setp.lt.s32 %p188, %r539, 64;
- and.pred %p187, %p1, %p188;
+ add.s32 %r538, %r14, %r543;
+ mad.lo.s32 %r537, %r538, %r4, %r8;
+ setp.lt.s32 %p186, %r537, 64;
+ and.pred %p185, %p1, %p186;
bar.sync 0;
ld.shared.f32 %f42, [%rd13];
bar.sync 0;
- @%p187 bra $L__BB0_50;
+ @%p185 bra $L__BB0_50;
bra.uni $L__BB0_48;
$L__BB0_50:
mul.f32 %f227, %f471, %f1;
@@ -546,13 +548,12 @@
sub.f32 %f293, %f291, %f292;
mul.f32 %f294, %f227, %f293;
add.f32 %f295, %f283, %f294;
mov.b32 %r345, %f295;
mov.b32 %r349, %f294;
- mad.lo.s32 %r350, %r543, %r3, %r10;
- mad.lo.s32 %r351, %r350, %r211, %r14;
- mul.wide.s32 %rd96, %r351, 4;
+ mad.lo.s32 %r350, %r33, %r211, %r7;
+ mul.wide.s32 %rd96, %r350, 4;
add.s64 %rd94, %rd41, %rd96;
st.global.cs.v4.s32 [%rd94], {%r342,%r343,%r344,%r345};
add.s64 %rd95, %rd43, %rd96;
@@ -568,11 +569,11 @@
ld.global.cs.v4.u32 {%r334,%r335,%r336,%r337}, [%rd14];
$L__BB0_51:
add.s32 %r543, %r543, 1;
- setp.lt.s32 %p42, %r543, %r9;
+ setp.lt.s32 %p42, %r543, %r10;
@%p42 bra $L__BB0_5;
bra.uni $L__BB0_52;
$L__BB0_3:
mov.f32 %f480, 0f00000000;
@@ -587,31 +588,31 @@
mov.f32 %f493, %f480;
mov.f32 %f494, %f480;
mov.f32 %f495, %f480;
$L__BB0_52:
- mov.u32 %r352, %tid.z;
- mad.lo.s32 %r353, %r3, %r352, %r7;
- mad.lo.s32 %r39, %r353, %r2, %r5;
+ mov.u32 %r351, %tid.z;
+ mad.lo.s32 %r352, %r4, %r351, %r8;
+ mad.lo.s32 %r39, %r352, %r3, %r6;
mul.wide.u32 %rd97, %r39, 4;
add.s64 %rd25, %rd50, %rd97;
- clz.b32 %r354, %r3;
- mov.u32 %r355, 31;
- sub.s32 %r356, %r355, %r354;
- mov.u32 %r357, 1;
- shl.b32 %r40, %r357, %r356;
- setp.lt.u32 %p43, %r7, %r40;
- add.s32 %r358, %r40, %r7;
- setp.lt.u32 %p44, %r358, %r3;
+ clz.b32 %r353, %r4;
+ mov.u32 %r354, 31;
+ sub.s32 %r355, %r354, %r353;
+ mov.u32 %r356, 1;
+ shl.b32 %r40, %r356, %r355;
+ setp.lt.u32 %p43, %r8, %r40;
+ add.s32 %r357, %r40, %r8;
+ setp.lt.u32 %p44, %r357, %r4;
and.pred %p5, %p43, %p44;
- shl.b32 %r359, %r2, %r356;
- add.s32 %r360, %r39, %r359;
- mul.wide.s32 %rd99, %r360, 4;
+ shl.b32 %r358, %r3, %r355;
+ add.s32 %r359, %r39, %r358;
+ mul.wide.s32 %rd99, %r359, 4;
add.s64 %rd26, %rd50, %rd99;
- shr.u32 %r361, %r40, 31;
- add.s32 %r362, %r40, %r361;
- shr.s32 %r572, %r362, 1;
+ shr.u32 %r360, %r40, 31;
+ add.s32 %r361, %r40, %r360;
+ shr.s32 %r572, %r361, 1;
st.shared.f32 [%rd25], %f480;
bar.sync 0;
not.pred %p45, %p5;
@%p45 bra $L__BB0_54;
@@ -626,15 +627,15 @@
@%p46 bra $L__BB0_59;
mov.u32 %r550, %r572;
$L__BB0_56:
- setp.ge.u32 %p47, %r7, %r550;
+ setp.ge.u32 %p47, %r8, %r550;
@%p47 bra $L__BB0_58;
- mad.lo.s32 %r363, %r550, %r2, %r39;
- mul.wide.s32 %rd100, %r363, 4;
+ mad.lo.s32 %r362, %r550, %r3, %r39;
+ mul.wide.s32 %rd100, %r362, 4;
add.s64 %rd102, %rd50, %rd100;
ld.shared.f32 %f299, [%rd25];
ld.shared.f32 %f300, [%rd102];
add.f32 %f301, %f300, %f299;
st.shared.f32 [%rd25], %f301;
@@ -645,18 +646,18 @@
setp.gt.u32 %p48, %r550, 3;
mov.u32 %r550, %r43;
@%p48 bra $L__BB0_56;
$L__BB0_59:
- add.s32 %r365, %r39, %r2;
- mul.wide.u32 %rd103, %r365, 4;
+ add.s32 %r364, %r39, %r3;
+ mul.wide.u32 %rd103, %r364, 4;
add.s64 %rd27, %rd50, %rd103;
- setp.ne.s32 %p49, %r7, 0;
+ setp.ne.s32 %p49, %r8, 0;
mov.u32 %r551, 0;
@%p49 bra $L__BB0_63;
- setp.lt.u32 %p50, %r3, 2;
+ setp.lt.u32 %p50, %r4, 2;
ld.shared.f32 %f302, [%rd25];
add.f32 %f496, %f302, 0f00000000;
@%p50 bra $L__BB0_62;
ld.shared.f32 %f303, [%rd27];
@@ -681,15 +682,15 @@
@%p46 bra $L__BB0_70;
mov.u32 %r552, %r572;
$L__BB0_67:
- setp.ge.u32 %p53, %r7, %r552;
+ setp.ge.u32 %p53, %r8, %r552;
@%p53 bra $L__BB0_69;
- mad.lo.s32 %r366, %r552, %r2, %r39;
- mul.wide.s32 %rd105, %r366, 4;
+ mad.lo.s32 %r365, %r552, %r3, %r39;
+ mul.wide.s32 %rd105, %r365, 4;
add.s64 %rd107, %rd50, %rd105;
ld.shared.f32 %f307, [%rd25];
ld.shared.f32 %f308, [%rd107];
add.f32 %f309, %f308, %f307;
st.shared.f32 [%rd25], %f309;
@@ -703,11 +704,11 @@
$L__BB0_70:
mov.u32 %r553, 0;
@%p49 bra $L__BB0_74;
- setp.lt.u32 %p56, %r3, 2;
+ setp.lt.u32 %p56, %r4, 2;
ld.shared.f32 %f310, [%rd25];
add.f32 %f497, %f310, 0f00000000;
@%p56 bra $L__BB0_73;
ld.shared.f32 %f311, [%rd27];
@@ -732,15 +733,15 @@
@%p46 bra $L__BB0_81;
mov.u32 %r554, %r572;
$L__BB0_78:
- setp.ge.u32 %p59, %r7, %r554;
+ setp.ge.u32 %p59, %r8, %r554;
@%p59 bra $L__BB0_80;
- mad.lo.s32 %r368, %r554, %r2, %r39;
- mul.wide.s32 %rd108, %r368, 4;
+ mad.lo.s32 %r367, %r554, %r3, %r39;
+ mul.wide.s32 %rd108, %r367, 4;
add.s64 %rd110, %rd50, %rd108;
ld.shared.f32 %f315, [%rd25];
ld.shared.f32 %f316, [%rd110];
add.f32 %f317, %f316, %f315;
st.shared.f32 [%rd25], %f317;
@@ -754,11 +755,11 @@
$L__BB0_81:
mov.u32 %r555, 0;
@%p49 bra $L__BB0_85;
- setp.lt.u32 %p62, %r3, 2;
+ setp.lt.u32 %p62, %r4, 2;
ld.shared.f32 %f318, [%rd25];
add.f32 %f498, %f318, 0f00000000;
@%p62 bra $L__BB0_84;
ld.shared.f32 %f319, [%rd27];
@@ -783,15 +784,15 @@
@%p46 bra $L__BB0_92;
mov.u32 %r556, %r572;
$L__BB0_89:
- setp.ge.u32 %p65, %r7, %r556;
+ setp.ge.u32 %p65, %r8, %r556;
@%p65 bra $L__BB0_91;
- mad.lo.s32 %r370, %r556, %r2, %r39;
- mul.wide.s32 %rd111, %r370, 4;
+ mad.lo.s32 %r369, %r556, %r3, %r39;
+ mul.wide.s32 %rd111, %r369, 4;
add.s64 %rd113, %rd50, %rd111;
ld.shared.f32 %f323, [%rd25];
ld.shared.f32 %f324, [%rd113];
add.f32 %f325, %f324, %f323;
st.shared.f32 [%rd25], %f325;
@@ -805,11 +806,11 @@
$L__BB0_92:
mov.u32 %r557, 0;
@%p49 bra $L__BB0_96;
- setp.lt.u32 %p68, %r3, 2;
+ setp.lt.u32 %p68, %r4, 2;
ld.shared.f32 %f326, [%rd25];
add.f32 %f499, %f326, 0f00000000;
@%p68 bra $L__BB0_95;
ld.shared.f32 %f327, [%rd27];
@@ -834,15 +835,15 @@
@%p46 bra $L__BB0_103;
mov.u32 %r558, %r572;
$L__BB0_100:
- setp.ge.u32 %p71, %r7, %r558;
+ setp.ge.u32 %p71, %r8, %r558;
@%p71 bra $L__BB0_102;
- mad.lo.s32 %r372, %r558, %r2, %r39;
- mul.wide.s32 %rd114, %r372, 4;
+ mad.lo.s32 %r371, %r558, %r3, %r39;
+ mul.wide.s32 %rd114, %r371, 4;
add.s64 %rd116, %rd50, %rd114;
ld.shared.f32 %f331, [%rd25];
ld.shared.f32 %f332, [%rd116];
add.f32 %f333, %f332, %f331;
st.shared.f32 [%rd25], %f333;
@@ -856,11 +857,11 @@
$L__BB0_103:
mov.u32 %r559, 0;
@%p49 bra $L__BB0_107;
- setp.lt.u32 %p74, %r3, 2;
+ setp.lt.u32 %p74, %r4, 2;
ld.shared.f32 %f334, [%rd25];
add.f32 %f500, %f334, 0f00000000;
@%p74 bra $L__BB0_106;
ld.shared.f32 %f335, [%rd27];
@@ -885,15 +886,15 @@
@%p46 bra $L__BB0_114;
mov.u32 %r560, %r572;
$L__BB0_111:
- setp.ge.u32 %p77, %r7, %r560;
+ setp.ge.u32 %p77, %r8, %r560;
@%p77 bra $L__BB0_113;
- mad.lo.s32 %r374, %r560, %r2, %r39;
- mul.wide.s32 %rd117, %r374, 4;
+ mad.lo.s32 %r373, %r560, %r3, %r39;
+ mul.wide.s32 %rd117, %r373, 4;
add.s64 %rd119, %rd50, %rd117;
ld.shared.f32 %f339, [%rd25];
ld.shared.f32 %f340, [%rd119];
add.f32 %f341, %f340, %f339;
st.shared.f32 [%rd25], %f341;
@@ -907,11 +908,11 @@
$L__BB0_114:
mov.u32 %r561, 0;
@%p49 bra $L__BB0_118;
- setp.lt.u32 %p80, %r3, 2;
+ setp.lt.u32 %p80, %r4, 2;
ld.shared.f32 %f342, [%rd25];
add.f32 %f501, %f342, 0f00000000;
@%p80 bra $L__BB0_117;
ld.shared.f32 %f343, [%rd27];
@@ -936,15 +937,15 @@
@%p46 bra $L__BB0_125;
mov.u32 %r562, %r572;
$L__BB0_122:
- setp.ge.u32 %p83, %r7, %r562;
+ setp.ge.u32 %p83, %r8, %r562;
@%p83 bra $L__BB0_124;
- mad.lo.s32 %r376, %r562, %r2, %r39;
- mul.wide.s32 %rd120, %r376, 4;
+ mad.lo.s32 %r375, %r562, %r3, %r39;
+ mul.wide.s32 %rd120, %r375, 4;
add.s64 %rd122, %rd50, %rd120;
ld.shared.f32 %f347, [%rd25];
ld.shared.f32 %f348, [%rd122];
add.f32 %f349, %f348, %f347;
st.shared.f32 [%rd25], %f349;
@@ -958,11 +959,11 @@
$L__BB0_125:
mov.u32 %r563, 0;
@%p49 bra $L__BB0_129;
- setp.lt.u32 %p86, %r3, 2;
+ setp.lt.u32 %p86, %r4, 2;
ld.shared.f32 %f350, [%rd25];
add.f32 %f502, %f350, 0f00000000;
@%p86 bra $L__BB0_128;
ld.shared.f32 %f351, [%rd27];
@@ -987,15 +988,15 @@
@%p46 bra $L__BB0_136;
mov.u32 %r564, %r572;
$L__BB0_133:
- setp.ge.u32 %p89, %r7, %r564;
+ setp.ge.u32 %p89, %r8, %r564;
@%p89 bra $L__BB0_135;
- mad.lo.s32 %r378, %r564, %r2, %r39;
- mul.wide.s32 %rd123, %r378, 4;
+ mad.lo.s32 %r377, %r564, %r3, %r39;
+ mul.wide.s32 %rd123, %r377, 4;
add.s64 %rd125, %rd50, %rd123;
ld.shared.f32 %f355, [%rd25];
ld.shared.f32 %f356, [%rd125];
add.f32 %f357, %f356, %f355;
st.shared.f32 [%rd25], %f357;
@@ -1009,11 +1010,11 @@
$L__BB0_136:
mov.u32 %r565, 0;
@%p49 bra $L__BB0_140;
- setp.lt.u32 %p92, %r3, 2;
+ setp.lt.u32 %p92, %r4, 2;
ld.shared.f32 %f358, [%rd25];
add.f32 %f503, %f358, 0f00000000;
@%p92 bra $L__BB0_139;
ld.shared.f32 %f359, [%rd27];
@@ -1038,15 +1039,15 @@
@%p46 bra $L__BB0_147;
mov.u32 %r566, %r572;
$L__BB0_144:
- setp.ge.u32 %p95, %r7, %r566;
+ setp.ge.u32 %p95, %r8, %r566;
@%p95 bra $L__BB0_146;
- mad.lo.s32 %r380, %r566, %r2, %r39;
- mul.wide.s32 %rd126, %r380, 4;
+ mad.lo.s32 %r379, %r566, %r3, %r39;
+ mul.wide.s32 %rd126, %r379, 4;
add.s64 %rd128, %rd50, %rd126;
ld.shared.f32 %f363, [%rd25];
ld.shared.f32 %f364, [%rd128];
add.f32 %f365, %f364, %f363;
st.shared.f32 [%rd25], %f365;
@@ -1060,11 +1061,11 @@
$L__BB0_147:
mov.u32 %r567, 0;
@%p49 bra $L__BB0_151;
- setp.lt.u32 %p98, %r3, 2;
+ setp.lt.u32 %p98, %r4, 2;
ld.shared.f32 %f366, [%rd25];
add.f32 %f504, %f366, 0f00000000;
@%p98 bra $L__BB0_150;
ld.shared.f32 %f367, [%rd27];
@@ -1089,15 +1090,15 @@
@%p46 bra $L__BB0_158;
mov.u32 %r568, %r572;
$L__BB0_155:
- setp.ge.u32 %p101, %r7, %r568;
+ setp.ge.u32 %p101, %r8, %r568;
@%p101 bra $L__BB0_157;
- mad.lo.s32 %r382, %r568, %r2, %r39;
- mul.wide.s32 %rd129, %r382, 4;
+ mad.lo.s32 %r381, %r568, %r3, %r39;
+ mul.wide.s32 %rd129, %r381, 4;
add.s64 %rd131, %rd50, %rd129;
ld.shared.f32 %f371, [%rd25];
ld.shared.f32 %f372, [%rd131];
add.f32 %f373, %f372, %f371;
st.shared.f32 [%rd25], %f373;
@@ -1111,11 +1112,11 @@
$L__BB0_158:
mov.u32 %r569, 0;
@%p49 bra $L__BB0_162;
- setp.lt.u32 %p104, %r3, 2;
+ setp.lt.u32 %p104, %r4, 2;
ld.shared.f32 %f374, [%rd25];
add.f32 %f505, %f374, 0f00000000;
@%p104 bra $L__BB0_161;
ld.shared.f32 %f375, [%rd27];
@@ -1140,15 +1141,15 @@
@%p46 bra $L__BB0_169;
mov.u32 %r570, %r572;
$L__BB0_166:
- setp.ge.u32 %p107, %r7, %r570;
+ setp.ge.u32 %p107, %r8, %r570;
@%p107 bra $L__BB0_168;
- mad.lo.s32 %r384, %r570, %r2, %r39;
- mul.wide.s32 %rd132, %r384, 4;
+ mad.lo.s32 %r383, %r570, %r3, %r39;
+ mul.wide.s32 %rd132, %r383, 4;
add.s64 %rd134, %rd50, %rd132;
ld.shared.f32 %f379, [%rd25];
ld.shared.f32 %f380, [%rd134];
add.f32 %f381, %f380, %f379;
st.shared.f32 [%rd25], %f381;
@@ -1162,11 +1163,11 @@
$L__BB0_169:
mov.u32 %r571, 0;
@%p49 bra $L__BB0_173;
- setp.lt.u32 %p110, %r3, 2;
+ setp.lt.u32 %p110, %r4, 2;
ld.shared.f32 %f382, [%rd25];
add.f32 %f506, %f382, 0f00000000;
@%p110 bra $L__BB0_172;
ld.shared.f32 %f383, [%rd27];
@@ -1189,15 +1190,15 @@
$L__BB0_175:
bar.sync 0;
@%p46 bra $L__BB0_179;
$L__BB0_176:
- setp.ge.u32 %p113, %r7, %r572;
+ setp.ge.u32 %p113, %r8, %r572;
@%p113 bra $L__BB0_178;
- mad.lo.s32 %r386, %r572, %r2, %r39;
- mul.wide.s32 %rd135, %r386, 4;
+ mad.lo.s32 %r385, %r572, %r3, %r39;
+ mul.wide.s32 %rd135, %r385, 4;
add.s64 %rd137, %rd50, %rd135;
ld.shared.f32 %f387, [%rd25];
ld.shared.f32 %f388, [%rd137];
add.f32 %f389, %f388, %f387;
st.shared.f32 [%rd25], %f389;
@@ -1211,11 +1212,11 @@
$L__BB0_179:
mov.u32 %r573, 0;
@%p49 bra $L__BB0_183;
- setp.lt.u32 %p116, %r3, 2;
+ setp.lt.u32 %p116, %r4, 2;
ld.shared.f32 %f390, [%rd25];
add.f32 %f507, %f390, 0f00000000;
@%p116 bra $L__BB0_182;
ld.shared.f32 %f391, [%rd27];
@@ -1223,21 +1224,20 @@
$L__BB0_182:
mov.b32 %r573, %f507;
$L__BB0_183:
- setp.eq.s32 %p186, %r7, 0;
- and.pred %p185, %p186, %p1;
- bar.sync 0;
- @%p185 bra $L__BB0_184;
+ setp.eq.s32 %p188, %r8, 0;
+ and.pred %p187, %p188, %p1;
+ bar.sync 0;
+ @%p187 bra $L__BB0_184;
bra.uni $L__BB0_185;
$L__BB0_184:
- shl.b32 %r538, %r5, 2;
- mov.u32 %r400, %ctaid.y;
- mad.lo.s32 %r401, %r211, %r400, %r538;
- mul.wide.s32 %rd141, %r401, 4;
+ mov.u32 %r399, %ctaid.y;
+ mad.lo.s32 %r400, %r211, %r399, %r7;
+ mul.wide.s32 %rd141, %r400, 4;
add.s64 %rd138, %rd46, %rd141;
st.volatile.global.v4.s32 [%rd138], {%r551,%r553,%r555,%r557};
add.s64 %rd139, %rd47, %rd141;
@@ -1251,26 +1251,26 @@
$L__BB0_185:
mov.u32 %r90, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r402, %r5, %r7;
- or.b32 %r404, %r402, %r352;
- setp.ne.s32 %p117, %r404, 0;
+ or.b32 %r401, %r6, %r8;
+ or.b32 %r403, %r401, %r351;
+ setp.ne.s32 %p117, %r403, 0;
@%p117 bra $L__BB0_189;
- ld.param.u64 %rd188, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14];
- cvta.to.global.u64 %rd142, %rd188;
- mov.u32 %r405, %ctaid.x;
- mov.u32 %r406, %ctaid.z;
- mov.u32 %r407, %nctaid.x;
- mad.lo.s32 %r408, %r406, %r407, %r405;
- mul.wide.s32 %rd143, %r408, 8;
+ ld.param.u64 %rd189, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_14];
+ cvta.to.global.u64 %rd142, %rd189;
+ mov.u32 %r404, %ctaid.x;
+ mov.u32 %r405, %ctaid.z;
+ mov.u32 %r406, %nctaid.x;
+ mad.lo.s32 %r407, %r405, %r406, %r404;
+ mul.wide.s32 %rd143, %r407, 8;
add.s64 %rd31, %rd142, %rd143;
- add.s32 %r409, %r8, -1;
- setp.eq.s32 %p118, %r90, %r409;
- cvt.s64.s32 %rd144, %r8;
+ add.s32 %r408, %r9, -1;
+ setp.eq.s32 %p118, %r90, %r408;
+ cvt.s64.s32 %rd144, %r9;
mov.u64 %rd145, -9223372036854775807;
sub.s64 %rd146, %rd145, %rd144;
selp.b64 %rd147, %rd146, 1, %p118;
atom.global.add.u64 %rd32, [%rd31], %rd147;
ld.volatile.global.u64 %rd148, [%rd31];
@@ -1283,60 +1283,60 @@
$L__BB0_188:
nanosleep.u32 %r574;
setp.lt.u32 %p120, %r574, 256;
- selp.u32 %r412, 1, 0, %p120;
- shl.b32 %r574, %r574, %r412;
+ selp.u32 %r411, 1, 0, %p120;
+ shl.b32 %r574, %r574, %r411;
ld.volatile.global.u64 %rd150, [%rd31];
xor.b64 %rd151, %rd150, %rd32;
setp.gt.s64 %p121, %rd151, -1;
@%p121 bra $L__BB0_188;
$L__BB0_189:
bar.sync 0;
- add.s32 %r413, %r8, %r2;
- add.s32 %r414, %r413, -1;
- div.s32 %r93, %r414, %r2;
+ add.s32 %r412, %r9, %r3;
+ add.s32 %r413, %r412, -1;
+ div.s32 %r93, %r413, %r3;
setp.lt.s32 %p122, %r93, 1;
mov.f32 %f510, 0f00000000;
mov.f32 %f511, %f510;
@%p122 bra $L__BB0_195;
- add.s32 %r416, %r211, 1;
- shr.u32 %r417, %r416, 31;
- add.s32 %r418, %r416, %r417;
- shr.s32 %r419, %r418, 1;
- add.s32 %r420, %r3, %r419;
- add.s32 %r421, %r420, -1;
- shl.b32 %r422, %r7, 1;
- shl.b32 %r423, %r3, 1;
- mad.lo.s32 %r424, %r423, %r90, %r422;
- or.b32 %r425, %r424, 1;
- setp.ge.s32 %p123, %r425, %r211;
- div.s32 %r426, %r421, %r3;
- setp.ge.s32 %p124, %r90, %r426;
+ add.s32 %r415, %r211, 1;
+ shr.u32 %r416, %r415, 31;
+ add.s32 %r417, %r415, %r416;
+ shr.s32 %r418, %r417, 1;
+ add.s32 %r419, %r4, %r418;
+ add.s32 %r420, %r419, -1;
+ shl.b32 %r421, %r8, 1;
+ shl.b32 %r422, %r4, 1;
+ mad.lo.s32 %r423, %r422, %r90, %r421;
+ or.b32 %r424, %r423, 1;
+ setp.ge.s32 %p123, %r424, %r211;
+ div.s32 %r425, %r420, %r4;
+ setp.ge.s32 %p124, %r90, %r425;
or.pred %p6, %p124, %p123;
- mul.lo.s32 %r427, %r3, %r90;
- shl.b32 %r428, %r427, 1;
- mad.lo.s32 %r429, %r211, %r5, %r428;
- add.s32 %r576, %r429, %r422;
- mul.lo.s32 %r95, %r211, %r2;
- mov.u32 %r415, 0;
+ mul.lo.s32 %r426, %r4, %r90;
+ shl.b32 %r427, %r426, 1;
+ mad.lo.s32 %r428, %r211, %r6, %r427;
+ add.s32 %r576, %r428, %r421;
+ mul.lo.s32 %r95, %r211, %r3;
+ mov.u32 %r414, 0;
mov.f32 %f510, 0f00000000;
- mov.u32 %r575, %r5;
- mov.u32 %r577, %r415;
+ mov.u32 %r575, %r6;
+ mov.u32 %r577, %r414;
$L__BB0_191:
.pragma "nounroll";
- mov.u32 %r578, %r415;
- mov.u32 %r579, %r415;
+ mov.u32 %r578, %r414;
+ mov.u32 %r579, %r414;
@%p6 bra $L__BB0_194;
- setp.ge.s32 %p125, %r575, %r8;
- mov.u32 %r578, %r415;
- mov.u32 %r579, %r415;
+ setp.ge.s32 %p125, %r575, %r9;
+ mov.u32 %r578, %r414;
+ mov.u32 %r579, %r414;
@%p125 bra $L__BB0_194;
mul.wide.s32 %rd153, %r576, 4;
add.s64 %rd152, %rd48, %rd153;
@@ -1347,31 +1347,31 @@
mov.b32 %f396, %r579;
add.f32 %f510, %f510, %f396;
mov.b32 %f397, %r578;
add.f32 %f511, %f511, %f397;
add.s32 %r576, %r576, %r95;
- add.s32 %r575, %r575, %r2;
+ add.s32 %r575, %r575, %r3;
add.s32 %r577, %r577, 1;
setp.lt.s32 %p126, %r577, %r93;
@%p126 bra $L__BB0_191;
$L__BB0_195:
- clz.b32 %r436, %r2;
- mov.u32 %r437, 31;
- sub.s32 %r438, %r437, %r436;
- mov.u32 %r439, 1;
- shl.b32 %r106, %r439, %r438;
- setp.lt.u32 %p127, %r5, %r106;
- add.s32 %r440, %r106, %r5;
- setp.lt.u32 %p128, %r440, %r2;
+ clz.b32 %r435, %r3;
+ mov.u32 %r436, 31;
+ sub.s32 %r437, %r436, %r435;
+ mov.u32 %r438, 1;
+ shl.b32 %r106, %r438, %r437;
+ setp.lt.u32 %p127, %r6, %r106;
+ add.s32 %r439, %r106, %r6;
+ setp.lt.u32 %p128, %r439, %r3;
and.pred %p7, %p127, %p128;
- add.s32 %r441, %r39, %r106;
- mul.wide.s32 %rd154, %r441, 4;
+ add.s32 %r440, %r39, %r106;
+ mul.wide.s32 %rd154, %r440, 4;
add.s64 %rd33, %rd50, %rd154;
- shr.u32 %r442, %r106, 31;
- add.s32 %r443, %r106, %r442;
- shr.s32 %r600, %r443, 1;
+ shr.u32 %r441, %r106, 31;
+ add.s32 %r442, %r106, %r441;
+ shr.s32 %r600, %r442, 1;
st.shared.f32 [%rd25], %f510;
bar.sync 0;
not.pred %p129, %p7;
@%p129 bra $L__BB0_197;
@@ -1386,15 +1386,15 @@
@%p130 bra $L__BB0_202;
mov.u32 %r580, %r600;
$L__BB0_199:
- setp.ge.u32 %p131, %r5, %r580;
+ setp.ge.u32 %p131, %r6, %r580;
@%p131 bra $L__BB0_201;
- add.s32 %r444, %r580, %r39;
- mul.wide.s32 %rd156, %r444, 4;
+ add.s32 %r443, %r580, %r39;
+ mul.wide.s32 %rd156, %r443, 4;
add.s64 %rd158, %rd50, %rd156;
ld.shared.f32 %f401, [%rd25];
ld.shared.f32 %f402, [%rd158];
add.f32 %f403, %f402, %f401;
st.shared.f32 [%rd25], %f403;
@@ -1405,18 +1405,18 @@
setp.gt.u32 %p132, %r580, 3;
mov.u32 %r580, %r109;
@%p132 bra $L__BB0_199;
$L__BB0_202:
- add.s32 %r446, %r39, 1;
- mul.wide.u32 %rd159, %r446, 4;
+ add.s32 %r445, %r39, 1;
+ mul.wide.u32 %rd159, %r445, 4;
add.s64 %rd34, %rd50, %rd159;
- setp.ne.s32 %p133, %r5, 0;
+ setp.ne.s32 %p133, %r6, 0;
mov.u32 %r581, 0;
@%p133 bra $L__BB0_206;
- setp.lt.u32 %p134, %r2, 2;
+ setp.lt.u32 %p134, %r3, 2;
ld.shared.f32 %f404, [%rd25];
add.f32 %f512, %f404, 0f00000000;
@%p134 bra $L__BB0_205;
ld.shared.f32 %f405, [%rd34];
@@ -1441,15 +1441,15 @@
@%p130 bra $L__BB0_213;
mov.u32 %r582, %r600;
$L__BB0_210:
- setp.ge.u32 %p137, %r5, %r582;
+ setp.ge.u32 %p137, %r6, %r582;
@%p137 bra $L__BB0_212;
- add.s32 %r447, %r582, %r39;
- mul.wide.s32 %rd161, %r447, 4;
+ add.s32 %r446, %r582, %r39;
+ mul.wide.s32 %rd161, %r446, 4;
add.s64 %rd163, %rd50, %rd161;
ld.shared.f32 %f409, [%rd25];
ld.shared.f32 %f410, [%rd163];
add.f32 %f411, %f410, %f409;
st.shared.f32 [%rd25], %f411;
@@ -1463,11 +1463,11 @@
$L__BB0_213:
mov.u32 %r583, 0;
@%p133 bra $L__BB0_217;
- setp.lt.u32 %p140, %r2, 2;
+ setp.lt.u32 %p140, %r3, 2;
ld.shared.f32 %f412, [%rd25];
add.f32 %f513, %f412, 0f00000000;
@%p140 bra $L__BB0_216;
ld.shared.f32 %f413, [%rd34];
@@ -1478,74 +1478,74 @@
$L__BB0_217:
bar.sync 0;
@%p133 bra $L__BB0_221;
- add.s32 %r449, %r211, 1;
- shr.u32 %r450, %r449, 31;
- add.s32 %r451, %r449, %r450;
- shr.s32 %r452, %r451, 1;
- add.s32 %r453, %r3, %r452;
- add.s32 %r454, %r453, -1;
- div.s32 %r455, %r454, %r3;
- setp.ge.s32 %p142, %r90, %r455;
+ add.s32 %r448, %r211, 1;
+ shr.u32 %r449, %r448, 31;
+ add.s32 %r450, %r448, %r449;
+ shr.s32 %r451, %r450, 1;
+ add.s32 %r452, %r4, %r451;
+ add.s32 %r453, %r452, -1;
+ div.s32 %r454, %r453, %r4;
+ setp.ge.s32 %p142, %r90, %r454;
@%p142 bra $L__BB0_221;
- shl.b32 %r116, %r7, 1;
- mul.lo.s32 %r456, %r3, %r90;
- shl.b32 %r117, %r456, 1;
- add.s32 %r457, %r116, %r117;
- or.b32 %r458, %r457, 1;
- setp.ge.s32 %p143, %r458, %r211;
+ shl.b32 %r116, %r8, 1;
+ mul.lo.s32 %r455, %r4, %r90;
+ shl.b32 %r117, %r455, 1;
+ add.s32 %r456, %r116, %r117;
+ or.b32 %r457, %r456, 1;
+ setp.ge.s32 %p143, %r457, %r211;
@%p143 bra $L__BB0_221;
- ld.param.u64 %rd187, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
- add.s32 %r461, %r117, %r116;
- mul.wide.s32 %rd165, %r461, 4;
- add.s64 %rd164, %rd187, %rd165;
+ ld.param.u64 %rd188, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_10];
+ add.s32 %r460, %r117, %r116;
+ mul.wide.s32 %rd165, %r460, 4;
+ add.s64 %rd164, %rd188, %rd165;
st.global.cs.v2.s32 [%rd164], {%r581,%r583};
$L__BB0_221:
mov.f32 %f516, 0f00000000;
mov.f32 %f517, %f516;
@%p122 bra $L__BB0_227;
- add.s32 %r463, %r211, 1;
- shr.u32 %r464, %r463, 31;
- add.s32 %r465, %r463, %r464;
- shr.s32 %r466, %r465, 1;
- add.s32 %r467, %r3, %r466;
- add.s32 %r468, %r467, -1;
- shl.b32 %r469, %r7, 1;
- shl.b32 %r470, %r3, 1;
- mad.lo.s32 %r471, %r470, %r90, %r469;
- or.b32 %r472, %r471, 1;
- setp.ge.s32 %p145, %r472, %r211;
- div.s32 %r473, %r468, %r3;
- setp.ge.s32 %p146, %r90, %r473;
+ add.s32 %r462, %r211, 1;
+ shr.u32 %r463, %r462, 31;
+ add.s32 %r464, %r462, %r463;
+ shr.s32 %r465, %r464, 1;
+ add.s32 %r466, %r4, %r465;
+ add.s32 %r467, %r466, -1;
+ shl.b32 %r468, %r8, 1;
+ shl.b32 %r469, %r4, 1;
+ mad.lo.s32 %r470, %r469, %r90, %r468;
+ or.b32 %r471, %r470, 1;
+ setp.ge.s32 %p145, %r471, %r211;
+ div.s32 %r472, %r467, %r4;
+ setp.ge.s32 %p146, %r90, %r472;
or.pred %p8, %p146, %p145;
- mul.lo.s32 %r474, %r3, %r90;
- shl.b32 %r475, %r474, 1;
- mad.lo.s32 %r476, %r211, %r5, %r475;
- add.s32 %r585, %r476, %r469;
- mul.lo.s32 %r119, %r211, %r2;
- mov.u32 %r462, 0;
+ mul.lo.s32 %r473, %r4, %r90;
+ shl.b32 %r474, %r473, 1;
+ mad.lo.s32 %r475, %r211, %r6, %r474;
+ add.s32 %r585, %r475, %r468;
+ mul.lo.s32 %r119, %r211, %r3;
+ mov.u32 %r461, 0;
mov.f32 %f516, 0f00000000;
- mov.u32 %r584, %r5;
- mov.u32 %r586, %r462;
+ mov.u32 %r584, %r6;
+ mov.u32 %r586, %r461;
$L__BB0_223:
.pragma "nounroll";
- mov.u32 %r587, %r462;
- mov.u32 %r588, %r462;
+ mov.u32 %r587, %r461;
+ mov.u32 %r588, %r461;
@%p8 bra $L__BB0_226;
- setp.ge.s32 %p147, %r584, %r8;
- mov.u32 %r587, %r462;
- mov.u32 %r588, %r462;
+ setp.ge.s32 %p147, %r584, %r9;
+ mov.u32 %r587, %r461;
+ mov.u32 %r588, %r461;
@%p147 bra $L__BB0_226;
mul.wide.s32 %rd167, %r585, 4;
add.s64 %rd166, %rd47, %rd167;
@@ -1556,11 +1556,11 @@
mov.b32 %f418, %r588;
add.f32 %f516, %f516, %f418;
mov.b32 %f419, %r587;
add.f32 %f517, %f517, %f419;
add.s32 %r585, %r585, %r119;
- add.s32 %r584, %r584, %r2;
+ add.s32 %r584, %r584, %r3;
add.s32 %r586, %r586, 1;
setp.lt.s32 %p148, %r586, %r93;
@%p148 bra $L__BB0_223;
$L__BB0_227:
@@ -1578,15 +1578,15 @@
@%p130 bra $L__BB0_234;
mov.u32 %r589, %r600;
$L__BB0_231:
- setp.ge.u32 %p151, %r5, %r589;
+ setp.ge.u32 %p151, %r6, %r589;
@%p151 bra $L__BB0_233;
- add.s32 %r483, %r589, %r39;
- mul.wide.s32 %rd168, %r483, 4;
+ add.s32 %r482, %r589, %r39;
+ mul.wide.s32 %rd168, %r482, 4;
add.s64 %rd170, %rd50, %rd168;
ld.shared.f32 %f423, [%rd25];
ld.shared.f32 %f424, [%rd170];
add.f32 %f425, %f424, %f423;
st.shared.f32 [%rd25], %f425;
@@ -1600,11 +1600,11 @@
$L__BB0_234:
mov.u32 %r590, 0;
@%p133 bra $L__BB0_238;
- setp.lt.u32 %p154, %r2, 2;
+ setp.lt.u32 %p154, %r3, 2;
ld.shared.f32 %f426, [%rd25];
add.f32 %f518, %f426, 0f00000000;
@%p154 bra $L__BB0_237;
ld.shared.f32 %f427, [%rd34];
@@ -1629,15 +1629,15 @@
@%p130 bra $L__BB0_245;
mov.u32 %r591, %r600;
$L__BB0_242:
- setp.ge.u32 %p157, %r5, %r591;
+ setp.ge.u32 %p157, %r6, %r591;
@%p157 bra $L__BB0_244;
- add.s32 %r485, %r591, %r39;
- mul.wide.s32 %rd171, %r485, 4;
+ add.s32 %r484, %r591, %r39;
+ mul.wide.s32 %rd171, %r484, 4;
add.s64 %rd173, %rd50, %rd171;
ld.shared.f32 %f431, [%rd25];
ld.shared.f32 %f432, [%rd173];
add.f32 %f433, %f432, %f431;
st.shared.f32 [%rd25], %f433;
@@ -1651,11 +1651,11 @@
$L__BB0_245:
mov.u32 %r592, 0;
@%p133 bra $L__BB0_249;
- setp.lt.u32 %p160, %r2, 2;
+ setp.lt.u32 %p160, %r3, 2;
ld.shared.f32 %f434, [%rd25];
add.f32 %f519, %f434, 0f00000000;
@%p160 bra $L__BB0_248;
ld.shared.f32 %f435, [%rd34];
@@ -1666,74 +1666,74 @@
$L__BB0_249:
bar.sync 0;
@%p133 bra $L__BB0_253;
- add.s32 %r487, %r211, 1;
- shr.u32 %r488, %r487, 31;
- add.s32 %r489, %r487, %r488;
- shr.s32 %r490, %r489, 1;
- add.s32 %r491, %r3, %r490;
- add.s32 %r492, %r491, -1;
- div.s32 %r493, %r492, %r3;
- setp.ge.s32 %p162, %r90, %r493;
+ add.s32 %r486, %r211, 1;
+ shr.u32 %r487, %r486, 31;
+ add.s32 %r488, %r486, %r487;
+ shr.s32 %r489, %r488, 1;
+ add.s32 %r490, %r4, %r489;
+ add.s32 %r491, %r490, -1;
+ div.s32 %r492, %r491, %r4;
+ setp.ge.s32 %p162, %r90, %r492;
@%p162 bra $L__BB0_253;
- shl.b32 %r138, %r7, 1;
- mul.lo.s32 %r494, %r3, %r90;
- shl.b32 %r139, %r494, 1;
- add.s32 %r495, %r138, %r139;
- or.b32 %r496, %r495, 1;
- setp.ge.s32 %p163, %r496, %r211;
+ shl.b32 %r138, %r8, 1;
+ mul.lo.s32 %r493, %r4, %r90;
+ shl.b32 %r139, %r493, 1;
+ add.s32 %r494, %r138, %r139;
+ or.b32 %r495, %r494, 1;
+ setp.ge.s32 %p163, %r495, %r211;
@%p163 bra $L__BB0_253;
- ld.param.u64 %rd186, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
- add.s32 %r499, %r139, %r138;
- mul.wide.s32 %rd175, %r499, 4;
- add.s64 %rd174, %rd186, %rd175;
+ ld.param.u64 %rd187, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_9];
+ add.s32 %r498, %r139, %r138;
+ mul.wide.s32 %rd175, %r498, 4;
+ add.s64 %rd174, %rd187, %rd175;
st.global.cs.v2.s32 [%rd174], {%r590,%r592};
$L__BB0_253:
mov.f32 %f522, 0f00000000;
mov.f32 %f523, %f522;
@%p122 bra $L__BB0_259;
- add.s32 %r501, %r211, 1;
- shr.u32 %r502, %r501, 31;
- add.s32 %r503, %r501, %r502;
- shr.s32 %r504, %r503, 1;
- add.s32 %r505, %r3, %r504;
- add.s32 %r506, %r505, -1;
- shl.b32 %r507, %r7, 1;
- shl.b32 %r508, %r3, 1;
- mad.lo.s32 %r509, %r508, %r90, %r507;
- or.b32 %r510, %r509, 1;
- setp.ge.s32 %p165, %r510, %r211;
- div.s32 %r511, %r506, %r3;
- setp.ge.s32 %p166, %r90, %r511;
+ add.s32 %r500, %r211, 1;
+ shr.u32 %r501, %r500, 31;
+ add.s32 %r502, %r500, %r501;
+ shr.s32 %r503, %r502, 1;
+ add.s32 %r504, %r4, %r503;
+ add.s32 %r505, %r504, -1;
+ shl.b32 %r506, %r8, 1;
+ shl.b32 %r507, %r4, 1;
+ mad.lo.s32 %r508, %r507, %r90, %r506;
+ or.b32 %r509, %r508, 1;
+ setp.ge.s32 %p165, %r509, %r211;
+ div.s32 %r510, %r505, %r4;
+ setp.ge.s32 %p166, %r90, %r510;
or.pred %p9, %p166, %p165;
- mul.lo.s32 %r512, %r3, %r90;
- shl.b32 %r513, %r512, 1;
- mad.lo.s32 %r514, %r211, %r5, %r513;
- add.s32 %r594, %r514, %r507;
- mul.lo.s32 %r141, %r211, %r2;
- mov.u32 %r500, 0;
+ mul.lo.s32 %r511, %r4, %r90;
+ shl.b32 %r512, %r511, 1;
+ mad.lo.s32 %r513, %r211, %r6, %r512;
+ add.s32 %r594, %r513, %r506;
+ mul.lo.s32 %r141, %r211, %r3;
+ mov.u32 %r499, 0;
mov.f32 %f522, 0f00000000;
- mov.u32 %r593, %r5;
- mov.u32 %r595, %r500;
+ mov.u32 %r593, %r6;
+ mov.u32 %r595, %r499;
$L__BB0_255:
.pragma "nounroll";
- mov.u32 %r596, %r500;
- mov.u32 %r597, %r500;
+ mov.u32 %r596, %r499;
+ mov.u32 %r597, %r499;
@%p9 bra $L__BB0_258;
- setp.ge.s32 %p167, %r593, %r8;
- mov.u32 %r596, %r500;
- mov.u32 %r597, %r500;
+ setp.ge.s32 %p167, %r593, %r9;
+ mov.u32 %r596, %r499;
+ mov.u32 %r597, %r499;
@%p167 bra $L__BB0_258;
mul.wide.s32 %rd177, %r594, 4;
add.s64 %rd176, %rd46, %rd177;
@@ -1744,11 +1744,11 @@
mov.b32 %f440, %r597;
add.f32 %f522, %f522, %f440;
mov.b32 %f441, %r596;
add.f32 %f523, %f523, %f441;
add.s32 %r594, %r594, %r141;
- add.s32 %r593, %r593, %r2;
+ add.s32 %r593, %r593, %r3;
add.s32 %r595, %r595, 1;
setp.lt.s32 %p168, %r595, %r93;
@%p168 bra $L__BB0_255;
$L__BB0_259:
@@ -1766,15 +1766,15 @@
@%p130 bra $L__BB0_266;
mov.u32 %r598, %r600;
$L__BB0_263:
- setp.ge.u32 %p171, %r5, %r598;
+ setp.ge.u32 %p171, %r6, %r598;
@%p171 bra $L__BB0_265;
- add.s32 %r521, %r598, %r39;
- mul.wide.s32 %rd178, %r521, 4;
+ add.s32 %r520, %r598, %r39;
+ mul.wide.s32 %rd178, %r520, 4;
add.s64 %rd180, %rd50, %rd178;
ld.shared.f32 %f445, [%rd25];
ld.shared.f32 %f446, [%rd180];
add.f32 %f447, %f446, %f445;
st.shared.f32 [%rd25], %f447;
@@ -1788,11 +1788,11 @@
$L__BB0_266:
mov.u32 %r599, 0;
@%p133 bra $L__BB0_270;
- setp.lt.u32 %p174, %r2, 2;
+ setp.lt.u32 %p174, %r3, 2;
ld.shared.f32 %f448, [%rd25];
add.f32 %f524, %f448, 0f00000000;
@%p174 bra $L__BB0_269;
ld.shared.f32 %f449, [%rd34];
@@ -1815,15 +1815,15 @@
$L__BB0_272:
bar.sync 0;
@%p130 bra $L__BB0_276;
$L__BB0_273:
- setp.ge.u32 %p177, %r5, %r600;
+ setp.ge.u32 %p177, %r6, %r600;
@%p177 bra $L__BB0_275;
- add.s32 %r523, %r600, %r39;
- mul.wide.s32 %rd181, %r523, 4;
+ add.s32 %r522, %r600, %r39;
+ mul.wide.s32 %rd181, %r522, 4;
add.s64 %rd183, %rd50, %rd181;
ld.shared.f32 %f453, [%rd25];
ld.shared.f32 %f454, [%rd183];
add.f32 %f455, %f454, %f453;
st.shared.f32 [%rd25], %f455;
@@ -1837,11 +1837,11 @@
$L__BB0_276:
mov.u32 %r601, 0;
@%p133 bra $L__BB0_280;
- setp.lt.u32 %p180, %r2, 2;
+ setp.lt.u32 %p180, %r3, 2;
ld.shared.f32 %f456, [%rd25];
add.f32 %f525, %f456, 0f00000000;
@%p180 bra $L__BB0_279;
ld.shared.f32 %f457, [%rd34];
@@ -1852,32 +1852,32 @@
$L__BB0_280:
bar.sync 0;
@%p133 bra $L__BB0_284;
- add.s32 %r525, %r211, 1;
- shr.u32 %r526, %r525, 31;
- add.s32 %r527, %r525, %r526;
- shr.s32 %r528, %r527, 1;
- add.s32 %r529, %r3, %r528;
- add.s32 %r530, %r529, -1;
- div.s32 %r531, %r530, %r3;
- setp.ge.s32 %p182, %r90, %r531;
+ add.s32 %r524, %r211, 1;
+ shr.u32 %r525, %r524, 31;
+ add.s32 %r526, %r524, %r525;
+ shr.s32 %r527, %r526, 1;
+ add.s32 %r528, %r4, %r527;
+ add.s32 %r529, %r528, -1;
+ div.s32 %r530, %r529, %r4;
+ setp.ge.s32 %p182, %r90, %r530;
@%p182 bra $L__BB0_284;
- shl.b32 %r160, %r7, 1;
- mul.lo.s32 %r532, %r3, %r90;
- shl.b32 %r161, %r532, 1;
- add.s32 %r533, %r160, %r161;
- or.b32 %r534, %r533, 1;
- setp.ge.s32 %p183, %r534, %r211;
+ shl.b32 %r160, %r8, 1;
+ mul.lo.s32 %r531, %r4, %r90;
+ shl.b32 %r161, %r531, 1;
+ add.s32 %r532, %r160, %r161;
+ or.b32 %r533, %r532, 1;
+ setp.ge.s32 %p183, %r533, %r211;
@%p183 bra $L__BB0_284;
- ld.param.u64 %rd189, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r537, %r161, %r160;
- mul.wide.s32 %rd185, %r537, 4;
- add.s64 %rd184, %rd189, %rd185;
+ ld.param.u64 %rd186, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S1_S2_S1_S2_S2_S1_S1_S1_NS0_IxLi1ELi1EEE_param_7];
+ add.s32 %r536, %r161, %r160;
+ mul.wide.s32 %rd185, %r536, 4;
+ add.s64 %rd184, %rd186, %rd185;
st.global.cs.v2.s32 [%rd184], {%r599,%r601};
$L__BB0_284:
Kernel 7
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-14
+14 index type: int
registers: 48
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 1, 1> T40, Tensor<float, 1, 1> T22, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T55, Tensor<float, 2, 2> T60, Tensor<int64_t, 1, 1> T65) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T37 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T55
// Allocate global tensor T60
__syncthreads();
Array<float, 4, 4> T56;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T56[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T61;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T61[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T54;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T59;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T59[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T36[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T53;
T53[0] = 0.000000000e+00f;
Array<float, 1, 1> T64;
T64[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
T59[i9]
= T59[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
T53[0]
= T53[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T64[0]
= T64[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T52;
T52.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T59[i9]
= T59[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T53[0]
= T53[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T64[0]
= T64[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T53[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T64[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T56[i6], T54[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T61[i7], T59[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T55[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T56[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T60[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T61[0]);
}
}
// Allocate global tensor T65
grid_sync::sync<false, true, false, true, true>(T65[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T63;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T63[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T62;
T62.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T62[0], &*(volatile float*)&T60[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T63[i12]
= T63[i12]
+ T62[i12];
}
}
Array<float, 2, 2> T42;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T42[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T42[i14], T63[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T42[0]);
}
Array<float, 2, 2> T39;
T39.set(float(0));
if (((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 2, 1> T58;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T58[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T57;
T57.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T57[0], &*(volatile float*)&T55[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T58[i15]
= T58[i15]
+ T57[i15];
}
}
Array<float, 2, 2> T41;
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
Array<float, 1, 1> T23;
T23[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T23[0], T58[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T41[i17]
= T23[0];
Array<float, 1, 1> T25;
T25[0]
= T23[0]
+ (float) 1.00000000000000000e+00;
T44[i17]
= T25[0]
+ T39[i17];
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T40[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T41[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 2, 2> T1, Tensor<float, 2, 2> T2, Tensor<float, 2, 2> T3, Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T4, Tensor<float, 1, 1> T5, Tensor<float, 1, 1> T40, Tensor<float, 1, 1> T22, Tensor<float, 2, 2> T20, Tensor<float, 1, 1> T27, Tensor<float, 2, 2> T55, Tensor<float, 2, 2> T60, Tensor<int64_t, 1, 1> T65) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[1LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T38 = reinterpret_cast<float*>(array + smem_offset + ((((((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4)) + 15) & -16));
float* T37 = reinterpret_cast<float*>(array + smem_offset + (((((((nvfuser_index_t)blockDim.y) * (ceilDiv(T0.logical_size[1LL], 4))) * 4) * 4) + 15) & -16));
float* T34 = reinterpret_cast<float*>(array + smem_offset + 0);
Tensor<float, 2, 2> s0;
s0.data = T0.data;
s0.logical_size = T0.logical_size;
s0.alloc_stride = T0.alloc_stride;
Array<nvfuser_index_t, 2, 1> a1;
a1 = s0.logical_size;
nvfuser_index_t i2;
i2 = a1[1LL];
double d3;
d3 = (double)(i2);
double d4;
d4 = 1.00000000000000000e+00 * d3;
double d5;
d5 = reciprocal(d4);
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)((toSmem(T38) + (16 * ((nvfuser_index_t)threadIdx.x))))),
"l"((T4.data + (4 * ((nvfuser_index_t)threadIdx.x)))),
"n"(16LL),
"r"((uint32_t)((!(((nvfuser_index_t)threadIdx.y) == 0))))
);
}
}
// Allocate global tensor T55
// Allocate global tensor T60
__syncthreads();
Array<float, 4, 4> T56;
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
T56[i6] = 0.000000000e+00f;
}
Array<float, 4, 4> T61;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
T61[i7] = 0.000000000e+00f;
}
Array<float, 4, 1> T54;
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8] = 0.000000000e+00f;
}
Array<float, 4, 1> T59;
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
T59[i9] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i10 = 0; i10 < (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i10) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
}
Array<float, 1, 1> T35;
T35[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T35[0]
= T2[(((T2.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T2.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T2.alloc_stride[0LL]) * i10))];
}
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
Array<float, 1, 1> T36;
T36[0] = 0;
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
} else {
if ((((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64)) {
T36[0]
= T3[(((T3.alloc_stride[0LL] * ((nvfuser_index_t)threadIdx.y)) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * T3.alloc_stride[0LL]) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T3.alloc_stride[0LL]) * i10))];
}
}
Array<float, 1, 1> T19;
T19[0]
= (float) d5
* T36[0];
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
Array<float, 1, 1> T14;
T14[0] = 0.000000000e+00f;
asm volatile("cp.async.wait_all;\n");
Array<float, 1, 1> T53;
T53[0] = 0.000000000e+00f;
Array<float, 1, 1> T64;
T64[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
T59[i9]
= T59[i9]
+ T21[0];
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
T53[0]
= T53[0]
+ T9[0];
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
T64[0]
= T64[0]
+ T13[0];
}
} else {
Array<float, 4, 4> T52;
T52.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
- T35[0];
Array<float, 1, 1> T7;
T7[0]
= T6[0]
* T36[0];
Array<float, 1, 1> T21;
T21[0]
= T46[i9]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T59[i9]
= T59[i9]
+ T21[0];
}
Array<float, 1, 1> T8;
T8[0]
= T52[i9];
Array<float, 1, 1> T9;
T9[0]
= T48[i9]
* T8[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T53[0]
= T53[0]
+ T9[0];
}
Array<float, 1, 1> T13;
T13[0]
= T9[0]
* T7[0];
if (((((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2) && (((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4)))) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
T64[0]
= T64[0]
+ T13[0];
}
}
}
blockReduce<true, false, false, true>(T11[0], T53[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
blockReduce<true, false, false, true>(T14[0], T64[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T12;
broadcast::blockBroadcast<true, false, false, true>(T12[0], T11[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
T31[0]
= T51[i11];
Array<float, 1, 1> T32;
T32[0]
= T47[i11]
* T31[0];
Array<float, 1, 1> T10;
T10[0]
= (float) d4
* T32[0];
Array<float, 1, 1> T28;
T28[0]
= T50[i11]
- T35[0];
Array<float, 1, 1> T29;
T29[0]
= T28[0]
* T36[0];
Array<float, 1, 1> T17;
T17[0]
= T10[0]
- T12[0];
Array<float, 1, 1> T16;
T16[0]
= T29[0]
* T15[0];
Array<float, 1, 1> T18;
T18[0]
= T17[0]
- T16[0];
T43[i11]
= T19[0]
* T18[0];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i6 = 0; i6 < 4; ++i6) {
blockReduce<false, true, false, true>(T56[i6], T54[i6], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 4; ++i7) {
blockReduce<false, true, false, true>(T61[i7], T59[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T55[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T56[0]);
}
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T60[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)blockIdx.y)))], &T61[0]);
}
}
// Allocate global tensor T65
grid_sync::sync<false, true, false, true, true>(T65[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T63;
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T63[i12] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i13 = 0; i13 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i13) {
Array<float, 2, 2> T62;
T62.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i13)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T62[0], &*(volatile float*)&T60[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i13))]);
}
#pragma unroll
for(nvfuser_index_t i12 = 0; i12 < 2; ++i12) {
T63[i12]
= T63[i12]
+ T62[i12];
}
}
Array<float, 2, 2> T42;
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
T42[i14] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i14 = 0; i14 < 2; ++i14) {
blockReduce<true, false, false, true>(T42[i14], T63[i14], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T22[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T42[0]);
}
Array<float, 2, 2> T39;
T39.set(float(0));
if (((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/false, CacheOp::Streaming>(&T39[0], &T5[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))]);
}
Array<float, 2, 1> T58;
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T58[i15] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i16 = 0; i16 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i16) {
Array<float, 2, 2> T57;
T57.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2)) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i16)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T57[0], &*(volatile float*)&T55[((((i2 * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * i2) * i16))]);
}
#pragma unroll
for(nvfuser_index_t i15 = 0; i15 < 2; ++i15) {
T58[i15]
= T58[i15]
+ T57[i15];
}
}
Array<float, 2, 2> T41;
Array<float, 2, 2> T44;
#pragma unroll
for(nvfuser_index_t i17 = 0; i17 < 2; ++i17) {
Array<float, 1, 1> T23;
T23[0] = 0.000000000e+00f;
blockReduce<true, false, false, true>(T23[0], T58[i17], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
T41[i17]
= T23[0];
Array<float, 1, 1> T25;
T25[0]
= T23[0]
+ (float) 1.00000000000000000e+00;
T44[i17]
= T25[0]
+ T39[i17];
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T40[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T41[0]);
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(i2, 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < i2))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T27[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T44[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -67,32 +67,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T37) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((s0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
+ T45[i8];
}
} else {
Array<float, 4, 4> T45;
T45.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T45[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i8 = 0; i8 < 4; ++i8) {
T54[i8]
= T54[i8]
@@ -116,11 +116,11 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * i2) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T34) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T1.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
@@ -152,17 +152,17 @@
Array<float, 4, 4> T52;
T52.set(float(0));
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T48;
T48.set(float(0));
- loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T49;
T49.set(float(0));
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T46;
T46.set(float(0));
- loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
= T49[i9]
@@ -203,21 +203,21 @@
loadGeneric<float, 4>( &T52[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T48;
T48.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T48[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T49;
T49.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T49[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T46;
T46.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T46[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i9 = 0; i9 < 4; ++i9) {
Array<float, 1, 1> T6;
T6[0]
@@ -266,17 +266,17 @@
Array<float, 1, 1> T15;
broadcast::blockBroadcast<true, false, false, true>(T15[0], T14[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
Array<float, 4, 4> T50;
T50.set(float(0));
- loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T51;
T51.set(float(0));
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
Array<float, 4, 4> T47;
T47.set(float(0));
- loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
Array<float, 1, 1> T31;
@@ -317,21 +317,21 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T20[((((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * i2) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * i2) * i10))], &T43[0]);
} else {
Array<float, 4, 4> T50;
T50.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T50[0], &T34[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T51;
T51.set(float(0));
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2))) {
loadGeneric<float, 4>( &T51[0], &T38[(4 * ((nvfuser_index_t)threadIdx.x))]);
}
Array<float, 4, 4> T47;
T47.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(i2, 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < i2)) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv(64, ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i10)) < 64))) {
- loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + (i2 * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T47[0], &T37[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(i2, 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
// Alias Allocation - register
auto& T43 = T47;
#pragma unroll
for(nvfuser_index_t i11 = 0; i11 < 4; ++i11) {
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<408>;
.reg .b32 %r<462>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r173, %r174}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r175, %r176}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd36, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
shr.s32 %r207, %r206, 2;
mov.u32 %r2, %ntid.x;
max.s32 %r208, %r207, %r2;
mov.u32 %r3, %ntid.y;
shl.b32 %r209, %r3, 2;
mad.lo.s32 %r210, %r209, %r208, 15;
and.b32 %r211, %r210, -16;
cvt.u64.u32 %rd1, %r211;
mul.lo.s32 %r212, %r3, %r207;
shl.b32 %r213, %r212, 4;
or.b32 %r214, %r213, 15;
and.b32 %r4, %r214, -16;
add.s32 %r215, %r214, %r4;
and.b32 %r216, %r215, -16;
cvt.s64.s32 %rd2, %r216;
mov.u64 %rd45, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_103395arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r5, %tid.x;
setp.lt.s32 %p9, %r5, %r207;
shl.b32 %r6, %r5, 2;
or.b32 %r217, %r6, 3;
setp.lt.s32 %p10, %r217, %r176;
and.pred %p1, %p10, %p9;
mov.u32 %r7, %tid.y;
setp.eq.s32 %p11, %r7, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
// end inline asm
shl.b32 %r221, %r5, 4;
add.s32 %r219, %r218, %r221;
mul.wide.s32 %rd49, %r6, 4;
add.s64 %rd48, %rd36, %rd49;
mov.u32 %r220, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r220, 0;
cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r222, %r3, 63;
div.s32 %r223, %r222, %r3;
mov.u32 %r8, %nctaid.y;
add.s32 %r224, %r8, %r223;
add.s32 %r225, %r224, -1;
div.s32 %r9, %r225, %r8;
setp.gt.s32 %p13, %r9, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
cvt.s64.s32 %rd50, %r4;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r227, %ctaid.y;
mul.lo.s32 %r228, %r9, %r3;
mul.lo.s32 %r10, %r228, %r227;
shl.b32 %r229, %r7, 2;
shl.b32 %r230, %r5, 4;
mad.lo.s32 %r11, %r229, %r176, %r230;
mul.lo.s32 %r231, %r176, %r7;
cvt.s64.s32 %rd54, %r231;
cvt.s64.s32 %rd55, %r6;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r232, %r10, %r176;
cvt.s64.s32 %rd6, %r232;
mul.lo.s32 %r12, %r176, %r3;
mul.lo.s32 %r13, %r9, %r227;
add.s32 %r14, %r231, %r6;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r14, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
mad.lo.s32 %r234, %r3, %r233, %r7;
mad.lo.s32 %r15, %r234, %r2, %r5;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r235, %r2;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
setp.lt.u32 %p14, %r5, %r16;
add.s32 %r239, %r16, %r5;
setp.lt.u32 %p15, %r239, %r2;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r6, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r234, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r422, 0;
mov.f32 %f370, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
// end inline asm
add.s32 %r247, %r11, %r246;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
// end inline asm
add.s32 %r273, %r11, %r272;
not.pred %p26, %p3;
mov.f32 %f371, %f370;
mov.f32 %f372, %f370;
mov.f32 %f373, %f370;
mov.f32 %f382, %f370;
mov.f32 %f383, %f370;
mov.f32 %f384, %f370;
mov.f32 %f385, %f370;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r244, %r422, %r3, %r7;
add.s32 %r245, %r244, %r10;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r249, %r12, %r422;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
mov.u32 %r248, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r248, 0;
cp.async.ca.shared.global [%r247], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r250, %r13, %r422;
mad.lo.s32 %r251, %r250, %r3, %r7;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r260, %r13, %r422;
mad.lo.s32 %r261, %r260, %r3, %r7;
setp.gt.s32 %p20, %r261, 63;
mov.u32 %r423, 0;
mov.u32 %r424, %r423;
mov.u32 %r425, %r423;
mov.u32 %r426, %r423;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r423, 0;
mov.u32 %r424, %r423;
mov.u32 %r425, %r423;
mov.u32 %r426, %r423;
$L__BB0_15:
add.s32 %r270, %r13, %r422;
mad.lo.s32 %r33, %r270, %r3, %r7;
mov.b32 %f117, %r426;
add.f32 %f385, %f385, %f117;
mov.b32 %f118, %r425;
add.f32 %f384, %f384, %f118;
mov.b32 %f119, %r424;
add.f32 %f383, %f383, %f119;
mov.b32 %f120, %r423;
add.f32 %f382, %f382, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f368, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r271, %r33, %r169;
mul.wide.s32 %rd71, %r271, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f368, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r275, %r12, %r422;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
mov.u32 %r274, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r274, 0;
cp.async.ca.shared.global [%r273], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r421, %r13, %r422;
mad.lo.s32 %r420, %r421, %r3, %r7;
setp.gt.s32 %p145, %r420, 63;
mov.f32 %f374, 0f00000000;
mov.f32 %f369, %f374;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
mul.wide.s32 %rd79, %r276, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f369, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f375, %f374;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd12];
sub.f32 %f129, %f124, %f368;
mul.f32 %f130, %f369, %f129;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd7];
fma.rn.f32 %f370, %f130, %f131, %f370;
ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%rd10];
mul.f32 %f141, %f136, %f131;
add.f32 %f142, %f141, 0f00000000;
fma.rn.f32 %f143, %f130, %f141, 0f00000000;
sub.f32 %f145, %f125, %f368;
mul.f32 %f146, %f369, %f145;
fma.rn.f32 %f371, %f146, %f132, %f371;
mul.f32 %f149, %f137, %f132;
add.f32 %f150, %f142, %f149;
fma.rn.f32 %f151, %f146, %f149, %f143;
sub.f32 %f153, %f126, %f368;
mul.f32 %f154, %f369, %f153;
fma.rn.f32 %f372, %f154, %f133, %f372;
mul.f32 %f157, %f138, %f133;
add.f32 %f158, %f150, %f157;
fma.rn.f32 %f159, %f154, %f157, %f151;
sub.f32 %f161, %f127, %f368;
mul.f32 %f162, %f369, %f161;
fma.rn.f32 %f373, %f162, %f134, %f373;
mul.f32 %f165, %f139, %f134;
add.f32 %f375, %f158, %f165;
fma.rn.f32 %f374, %f162, %f165, %f159;
$L__BB0_23:
st.shared.f32 [%rd8], %f375;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f166, [%rd9];
ld.shared.f32 %f167, [%rd8];
add.f32 %f168, %f166, %f167;
st.shared.f32 [%rd8], %f168;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r427, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r5, %r427;
@%p28 bra $L__BB0_29;
add.s32 %r277, %r427, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r427, 1;
setp.gt.u32 %p29, %r427, 3;
mov.u32 %r427, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r5, 0;
mov.f32 %f376, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r2, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f376, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
add.f32 %f376, %f376, %f174;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f374;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f175, [%rd9];
ld.shared.f32 %f176, [%rd8];
add.f32 %f177, %f175, %f176;
st.shared.f32 [%rd8], %f177;
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
mov.u32 %r428, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r5, %r428;
@%p34 bra $L__BB0_39;
add.s32 %r278, %r428, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r428, 1;
setp.gt.u32 %p35, %r428, 3;
mov.u32 %r428, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f377, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r2, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f377, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
add.f32 %f377, %f377, %f183;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f376;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f377;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f184, %f369, %f1;
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd10];
ld.shared.v4.f32 {%f190, %f191, %f192, %f193}, [%rd7];
mul.f32 %f195, %f185, %f190;
mul.f32 %f196, %f195, %f2;
ld.shared.v4.f32 {%f197, %f198, %f199, %f200}, [%rd12];
sub.f32 %f202, %f197, %f368;
mul.f32 %f203, %f369, %f202;
sub.f32 %f204, %f196, %f37;
mul.f32 %f205, %f38, %f203;
sub.f32 %f206, %f204, %f205;
mul.f32 %f207, %f184, %f206;
mov.b32 %r279, %f207;
mul.f32 %f210, %f186, %f191;
mul.f32 %f211, %f210, %f2;
sub.f32 %f213, %f198, %f368;
mul.f32 %f214, %f369, %f213;
sub.f32 %f215, %f211, %f37;
mul.f32 %f216, %f38, %f214;
sub.f32 %f217, %f215, %f216;
mul.f32 %f218, %f184, %f217;
mov.b32 %r280, %f218;
mul.f32 %f221, %f187, %f192;
mul.f32 %f222, %f221, %f2;
sub.f32 %f224, %f199, %f368;
mul.f32 %f225, %f369, %f224;
sub.f32 %f226, %f222, %f37;
mul.f32 %f227, %f38, %f225;
sub.f32 %f228, %f226, %f227;
mul.f32 %f229, %f184, %f228;
mov.b32 %r281, %f229;
mul.f32 %f232, %f188, %f193;
mul.f32 %f233, %f232, %f2;
sub.f32 %f235, %f200, %f368;
mul.f32 %f236, %f369, %f235;
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
mad.lo.s32 %r283, %r422, %r3, %r10;
mad.lo.s32 %r284, %r283, %r176, %r14;
mul.wide.s32 %rd88, %r284, 4;
add.s64 %rd87, %rd40, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_49:
add.s32 %r422, %r422, 1;
setp.lt.s32 %p41, %r422, %r9;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f370, 0f00000000;
mov.f32 %f371, %f370;
mov.f32 %f372, %f370;
mov.f32 %f373, %f370;
mov.f32 %f382, %f370;
mov.f32 %f383, %f370;
mov.f32 %f384, %f370;
mov.f32 %f385, %f370;
$L__BB0_50:
mov.u32 %r285, %tid.z;
mad.lo.s32 %r286, %r3, %r285, %r7;
mad.lo.s32 %r39, %r286, %r2, %r5;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r287, %r3;
mov.u32 %r288, 31;
sub.s32 %r289, %r288, %r287;
mov.u32 %r290, 1;
shl.b32 %r40, %r290, %r289;
setp.lt.u32 %p42, %r7, %r40;
add.s32 %r291, %r40, %r7;
setp.lt.u32 %p43, %r291, %r3;
and.pred %p5, %p42, %p43;
shl.b32 %r292, %r2, %r289;
add.s32 %r293, %r39, %r292;
mul.wide.s32 %rd91, %r293, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r294, %r40, 31;
add.s32 %r295, %r40, %r294;
shr.s32 %r443, %r295, 1;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f241, [%rd24];
ld.shared.f32 %f242, [%rd23];
add.f32 %f243, %f241, %f242;
st.shared.f32 [%rd23], %f243;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r429, %r443;
$L__BB0_54:
setp.ge.u32 %p46, %r7, %r429;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r296, %r429, %r2, %r39;
mul.wide.s32 %rd92, %r296, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r429, 1;
setp.gt.u32 %p47, %r429, 3;
mov.u32 %r429, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r298, %r39, %r2;
mul.wide.u32 %rd95, %r298, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p48, %r7, 0;
mov.u32 %r430, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r3, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f386, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f386, %f386, %f248;
$L__BB0_60:
mov.b32 %r430, %f386;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f249, [%rd24];
ld.shared.f32 %f250, [%rd23];
add.f32 %f251, %f249, %f250;
st.shared.f32 [%rd23], %f251;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r431, %r443;
$L__BB0_65:
setp.ge.u32 %p52, %r7, %r431;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r299, %r431, %r2, %r39;
mul.wide.s32 %rd97, %r299, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r431, 1;
setp.gt.u32 %p53, %r431, 3;
mov.u32 %r431, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r432, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r3, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f387, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f387, %f387, %f256;
$L__BB0_71:
mov.b32 %r432, %f387;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f384;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f257, [%rd24];
ld.shared.f32 %f258, [%rd23];
add.f32 %f259, %f257, %f258;
st.shared.f32 [%rd23], %f259;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r433, %r443;
$L__BB0_76:
setp.ge.u32 %p58, %r7, %r433;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r301, %r433, %r2, %r39;
mul.wide.s32 %rd100, %r301, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r433, 1;
setp.gt.u32 %p59, %r433, 3;
mov.u32 %r433, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r434, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r3, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f388, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f388, %f388, %f264;
$L__BB0_82:
mov.b32 %r434, %f388;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f385;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f265, [%rd24];
ld.shared.f32 %f266, [%rd23];
add.f32 %f267, %f265, %f266;
st.shared.f32 [%rd23], %f267;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r435, %r443;
$L__BB0_87:
setp.ge.u32 %p64, %r7, %r435;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r303, %r435, %r2, %r39;
mul.wide.s32 %rd103, %r303, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r435, 1;
setp.gt.u32 %p65, %r435, 3;
mov.u32 %r435, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r436, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r3, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f389, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f389, %f389, %f272;
$L__BB0_93:
mov.b32 %r436, %f389;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f273, [%rd24];
ld.shared.f32 %f274, [%rd23];
add.f32 %f275, %f273, %f274;
st.shared.f32 [%rd23], %f275;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r437, %r443;
$L__BB0_98:
setp.ge.u32 %p70, %r7, %r437;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r305, %r437, %r2, %r39;
mul.wide.s32 %rd106, %r305, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r437, 1;
setp.gt.u32 %p71, %r437, 3;
mov.u32 %r437, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r438, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r3, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f390, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f390, %f390, %f280;
$L__BB0_104:
mov.b32 %r438, %f390;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f281, [%rd24];
ld.shared.f32 %f282, [%rd23];
add.f32 %f283, %f281, %f282;
st.shared.f32 [%rd23], %f283;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r439, %r443;
$L__BB0_109:
setp.ge.u32 %p76, %r7, %r439;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r307, %r439, %r2, %r39;
mul.wide.s32 %rd109, %r307, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r439, 1;
setp.gt.u32 %p77, %r439, 3;
mov.u32 %r439, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r440, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r3, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f391, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f391, %f391, %f288;
$L__BB0_115:
mov.b32 %r440, %f391;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f372;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f289, [%rd24];
ld.shared.f32 %f290, [%rd23];
add.f32 %f291, %f289, %f290;
st.shared.f32 [%rd23], %f291;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r441, %r443;
$L__BB0_120:
setp.ge.u32 %p82, %r7, %r441;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r309, %r441, %r2, %r39;
mul.wide.s32 %rd112, %r309, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r441, 1;
setp.gt.u32 %p83, %r441, 3;
mov.u32 %r441, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r442, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r3, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f392, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f392, %f392, %f296;
$L__BB0_126:
mov.b32 %r442, %f392;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f373;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f297, [%rd24];
ld.shared.f32 %f298, [%rd23];
add.f32 %f299, %f297, %f298;
st.shared.f32 [%rd23], %f299;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r7, %r443;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r311, %r443, %r2, %r39;
mul.wide.s32 %rd115, %r311, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r443, 1;
setp.gt.u32 %p89, %r443, 3;
mov.u32 %r443, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r444, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r3, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f393, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f393, %f393, %f304;
$L__BB0_136:
mov.b32 %r444, %f393;
$L__BB0_137:
setp.eq.s32 %p144, %r7, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
shl.b32 %r419, %r5, 2;
mov.u32 %r321, %ctaid.y;
mad.lo.s32 %r322, %r176, %r321, %r419;
mul.wide.s32 %rd120, %r322, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r430,%r432,%r434,%r436};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r438,%r440,%r442,%r444};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r323, %r5, %r7;
or.b32 %r325, %r323, %r285;
setp.ne.s32 %p92, %r325, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
mov.u32 %r326, %ctaid.x;
mov.u32 %r327, %ctaid.z;
mov.u32 %r328, %nctaid.x;
mad.lo.s32 %r329, %r327, %r328, %r326;
mul.wide.s32 %rd122, %r329, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r330, %r8, -1;
setp.eq.s32 %p93, %r74, %r330;
cvt.s64.s32 %rd123, %r8;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r445, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r445;
// end inline asm
setp.lt.u32 %p95, %r445, 256;
selp.u32 %r333, 1, 0, %p95;
shl.b32 %r445, %r445, %r333;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r334, %r8, %r2;
add.s32 %r335, %r334, -1;
div.s32 %r77, %r335, %r2;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f396, 0f00000000;
mov.f32 %f397, %f396;
@%p97 bra $L__BB0_149;
add.s32 %r337, %r176, 1;
shr.u32 %r338, %r337, 31;
add.s32 %r339, %r337, %r338;
shr.s32 %r340, %r339, 1;
add.s32 %r341, %r3, %r340;
add.s32 %r342, %r341, -1;
shl.b32 %r343, %r7, 1;
shl.b32 %r344, %r3, 1;
mad.lo.s32 %r345, %r344, %r74, %r343;
or.b32 %r346, %r345, 1;
setp.ge.s32 %p98, %r346, %r176;
div.s32 %r347, %r342, %r3;
setp.ge.s32 %p99, %r74, %r347;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r348, %r3, %r74;
shl.b32 %r349, %r348, 1;
mad.lo.s32 %r350, %r176, %r5, %r349;
add.s32 %r447, %r350, %r343;
mul.lo.s32 %r79, %r176, %r2;
mov.u32 %r336, 0;
mov.f32 %f396, 0f00000000;
mov.u32 %r446, %r5;
mov.u32 %r448, %r336;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r449, %r336;
mov.u32 %r450, %r336;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r446, %r8;
mov.u32 %r449, %r336;
mov.u32 %r450, %r336;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd132, %r447, 4;
add.s64 %rd131, %rd43, %rd132;
// begin inline asm
ld.volatile.global.v2.s32 {%r450,%r449}, [%rd131];
// end inline asm
$L__BB0_148:
mov.b32 %f309, %r450;
add.f32 %f396, %f396, %f309;
mov.b32 %f310, %r449;
add.f32 %f397, %f397, %f310;
add.s32 %r447, %r447, %r79;
add.s32 %r446, %r446, %r2;
add.s32 %r448, %r448, 1;
setp.lt.s32 %p101, %r448, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r357, %r2;
mov.u32 %r358, 31;
sub.s32 %r359, %r358, %r357;
mov.u32 %r360, 1;
shl.b32 %r90, %r360, %r359;
setp.lt.u32 %p102, %r5, %r90;
add.s32 %r361, %r90, %r5;
setp.lt.u32 %p103, %r361, %r2;
and.pred %p7, %p102, %p103;
add.s32 %r362, %r39, %r90;
mul.wide.s32 %rd133, %r362, 4;
add.s64 %rd30, %rd45, %rd133;
shr.u32 %r363, %r90, 31;
add.s32 %r364, %r90, %r363;
shr.s32 %r461, %r364, 1;
st.shared.f32 [%rd23], %f396;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f311, [%rd30];
ld.shared.f32 %f312, [%rd23];
add.f32 %f313, %f311, %f312;
st.shared.f32 [%rd23], %f313;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r451, %r461;
$L__BB0_153:
setp.ge.u32 %p106, %r5, %r451;
@%p106 bra $L__BB0_155;
add.s32 %r365, %r451, %r39;
mul.wide.s32 %rd135, %r365, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r451, 1;
setp.gt.u32 %p107, %r451, 3;
mov.u32 %r451, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r367, %r39, 1;
mul.wide.u32 %rd138, %r367, 4;
add.s64 %rd31, %rd45, %rd138;
setp.ne.s32 %p108, %r5, 0;
mov.u32 %r452, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r2, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f398, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f398, %f398, %f318;
$L__BB0_159:
mov.b32 %r452, %f398;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f397;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f319, [%rd30];
ld.shared.f32 %f320, [%rd23];
add.f32 %f321, %f319, %f320;
st.shared.f32 [%rd23], %f321;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r453, %r461;
$L__BB0_164:
setp.ge.u32 %p112, %r5, %r453;
@%p112 bra $L__BB0_166;
add.s32 %r368, %r453, %r39;
mul.wide.s32 %rd140, %r368, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r453, 1;
setp.gt.u32 %p113, %r453, 3;
mov.u32 %r453, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r454, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r2, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f399, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f399, %f399, %f326;
$L__BB0_170:
mov.b32 %r454, %f399;
$L__BB0_171:
bar.sync 0;
setp.eq.s32 %p116, %r5, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
add.s32 %r370, %r176, 1;
shr.u32 %r371, %r370, 31;
add.s32 %r372, %r370, %r371;
shr.s32 %r373, %r372, 1;
add.s32 %r374, %r3, %r373;
add.s32 %r375, %r374, -1;
div.s32 %r376, %r375, %r3;
setp.ge.s32 %p117, %r74, %r376;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r7, 1;
mul.lo.s32 %r377, %r3, %r74;
shl.b32 %r101, %r377, 1;
add.s32 %r378, %r100, %r101;
or.b32 %r379, %r378, 1;
setp.ge.s32 %p118, %r379, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r382, %r101, %r100;
mul.wide.s32 %rd144, %r382, 4;
add.s64 %rd143, %rd160, %rd144;
// begin inline asm
st.global.cs.v2.s32 [%rd143], {%r452,%r454};
// end inline asm
$L__BB0_175:
add.s32 %r383, %r176, 1;
shr.u32 %r384, %r383, 31;
add.s32 %r385, %r383, %r384;
shr.s32 %r386, %r385, 1;
add.s32 %r387, %r3, %r386;
add.s32 %r388, %r387, -1;
div.s32 %r102, %r388, %r3;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f400, 0f00000000;
mov.f32 %f404, 0f00000000;
mov.f32 %f401, %f404;
@%p119 bra $L__BB0_178;
shl.b32 %r103, %r7, 1;
mul.lo.s32 %r389, %r3, %r74;
shl.b32 %r104, %r389, 1;
add.s32 %r390, %r103, %r104;
or.b32 %r391, %r390, 1;
setp.ge.s32 %p120, %r391, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r394, %r104, %r103;
mul.wide.s32 %rd146, %r394, 4;
add.s64 %rd145, %rd159, %rd146;
// begin inline asm
ld.global.cs.v2.u32 {%r392,%r393}, [%rd145];
// end inline asm
mov.b32 %f400, %r392;
mov.b32 %f401, %r393;
$L__BB0_178:
mov.f32 %f405, %f404;
@%p97 bra $L__BB0_184;
shl.b32 %r396, %r7, 1;
shl.b32 %r397, %r3, 1;
mad.lo.s32 %r398, %r397, %r74, %r396;
or.b32 %r399, %r398, 1;
setp.ge.s32 %p122, %r399, %r176;
or.pred %p8, %p122, %p119;
mul.lo.s32 %r400, %r3, %r74;
shl.b32 %r401, %r400, 1;
mad.lo.s32 %r402, %r176, %r5, %r401;
add.s32 %r456, %r402, %r396;
mul.lo.s32 %r106, %r176, %r2;
mov.u32 %r395, 0;
mov.f32 %f404, 0f00000000;
mov.u32 %r455, %r5;
mov.f32 %f405, %f404;
mov.u32 %r457, %r395;
$L__BB0_180:
.pragma "nounroll";
mov.u32 %r458, %r395;
mov.u32 %r459, %r395;
@%p8 bra $L__BB0_183;
setp.ge.s32 %p124, %r455, %r8;
mov.u32 %r458, %r395;
mov.u32 %r459, %r395;
@%p124 bra $L__BB0_183;
mul.wide.s32 %rd148, %r456, 4;
add.s64 %rd147, %rd42, %rd148;
// begin inline asm
ld.volatile.global.v2.s32 {%r459,%r458}, [%rd147];
// end inline asm
$L__BB0_183:
mov.b32 %f335, %r459;
add.f32 %f404, %f404, %f335;
mov.b32 %f336, %r458;
add.f32 %f405, %f405, %f336;
add.s32 %r456, %r456, %r106;
add.s32 %r455, %r455, %r2;
add.s32 %r457, %r457, 1;
setp.lt.s32 %p125, %r457, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f404;
bar.sync 0;
@%p104 bra $L__BB0_186;
ld.shared.f32 %f337, [%rd30];
ld.shared.f32 %f338, [%rd23];
add.f32 %f339, %f337, %f338;
st.shared.f32 [%rd23], %f339;
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
mov.u32 %r460, %r461;
$L__BB0_188:
setp.ge.u32 %p128, %r5, %r460;
@%p128 bra $L__BB0_190;
add.s32 %r409, %r460, %r39;
mul.wide.s32 %rd149, %r409, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
shr.u32 %r118, %r460, 1;
setp.gt.u32 %p129, %r460, 3;
mov.u32 %r460, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f406, 0f00000000;
@%p108 bra $L__BB0_194;
setp.lt.u32 %p131, %r2, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f406, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
add.f32 %f406, %f406, %f345;
$L__BB0_194:
bar.sync 0;
st.shared.f32 [%rd23], %f405;
bar.sync 0;
@%p104 bra $L__BB0_196;
ld.shared.f32 %f346, [%rd30];
ld.shared.f32 %f347, [%rd23];
add.f32 %f348, %f346, %f347;
st.shared.f32 [%rd23], %f348;
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
setp.ge.u32 %p134, %r5, %r461;
@%p134 bra $L__BB0_199;
add.s32 %r410, %r461, %r39;
mul.wide.s32 %rd152, %r410, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
shr.u32 %r120, %r461, 1;
setp.gt.u32 %p135, %r461, 3;
mov.u32 %r461, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f353, %f406, 0f3F800000;
add.f32 %f96, %f353, %f400;
mov.f32 %f407, 0f00000000;
@%p108 bra $L__BB0_203;
setp.lt.u32 %p137, %r2, 2;
ld.shared.f32 %f354, [%rd23];
add.f32 %f407, %f354, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f355, [%rd31];
add.f32 %f407, %f407, %f355;
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
shl.b32 %r121, %r7, 1;
mul.lo.s32 %r411, %r3, %r74;
shl.b32 %r122, %r411, 1;
add.s32 %r412, %r121, %r122;
or.b32 %r413, %r412, 1;
setp.ge.s32 %p141, %r413, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_85edfb01_1033910nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r418, %r122, %r121;
mul.wide.s32 %rd157, %r418, 4;
add.s64 %rd155, %rd158, %rd157;
mov.b32 %r415, %f407;
mov.b32 %r414, %f406;
// begin inline asm
st.global.cs.v2.s32 [%rd155], {%r414,%r415};
// end inline asm
add.s64 %rd156, %rd162, %rd157;
add.f32 %f356, %f407, 0f3F800000;
add.f32 %f357, %f356, %f401;
mov.b32 %r417, %f357;
mov.b32 %r416, %f96;
// begin inline asm
st.global.cs.v2.s32 [%rd156], {%r416,%r417};
// end inline asm
$L__BB0_206:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<408>;
.reg .b32 %r<460>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
ld.param.v2.u32 {%r173, %r174}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2+16];
ld.param.v2.u32 {%r175, %r176}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3+8];
ld.param.u64 %rd43, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_11];
ld.param.u64 %rd42, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_10];
ld.param.u64 %rd40, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_8];
ld.param.u64 %rd36, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd35, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd34, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd33, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd32, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
shr.s32 %r2, %r206, 2;
mov.u32 %r3, %ntid.x;
max.s32 %r207, %r2, %r3;
mov.u32 %r4, %ntid.y;
shl.b32 %r208, %r4, 2;
mad.lo.s32 %r209, %r208, %r207, 15;
and.b32 %r210, %r209, -16;
cvt.u64.u32 %rd1, %r210;
mul.lo.s32 %r211, %r4, %r2;
shl.b32 %r212, %r211, 4;
or.b32 %r213, %r212, 15;
and.b32 %r5, %r213, -16;
add.s32 %r214, %r213, %r5;
and.b32 %r215, %r214, -16;
cvt.s64.s32 %rd2, %r215;
mov.u64 %rd45, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_72335arrayE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
mov.u32 %r6, %tid.x;
setp.lt.s32 %p9, %r6, %r2;
shl.b32 %r7, %r6, 2;
or.b32 %r216, %r7, 3;
setp.lt.s32 %p10, %r216, %r176;
and.pred %p1, %p10, %p9;
mov.u32 %r8, %tid.y;
setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r217, smem_ptr; }
// end inline asm
shl.b32 %r220, %r6, 4;
add.s32 %r218, %r217, %r220;
mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd36, %rd49;
mov.u32 %r219, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r219, 0;
cp.async.ca.shared.global [%r218], [%rd48], 16, p0;
}
// end inline asm
$L__BB0_2:
bar.sync 0;
add.s32 %r221, %r4, 63;
div.s32 %r222, %r221, %r4;
mov.u32 %r9, %nctaid.y;
add.s32 %r223, %r9, %r222;
add.s32 %r224, %r223, -1;
div.s32 %r10, %r224, %r9;
setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
mov.u32 %r226, %ctaid.y;
mul.lo.s32 %r227, %r10, %r4;
mul.lo.s32 %r11, %r227, %r226;
mad.lo.s32 %r228, %r2, %r8, %r6;
shl.b32 %r12, %r228, 4;
mul.lo.s32 %r229, %r176, %r8;
cvt.s64.s32 %rd54, %r229;
cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
mul.lo.s32 %r230, %r11, %r176;
cvt.s64.s32 %rd6, %r230;
mul.lo.s32 %r13, %r176, %r4;
mul.lo.s32 %r14, %r10, %r226;
shl.b32 %r231, %r8, 2;
mad.lo.s32 %r232, %r231, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
mul.wide.s32 %rd57, %r232, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
mad.lo.s32 %r234, %r4, %r233, %r8;
mad.lo.s32 %r15, %r234, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
clz.b32 %r235, %r3;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
setp.lt.u32 %p14, %r6, %r16;
add.s32 %r239, %r16, %r6;
setp.lt.u32 %p15, %r239, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
mul.wide.s32 %rd63, %r234, 4;
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
mov.u32 %r420, 0;
mov.f32 %f370, 0f00000000;
not.pred %p16, %p1;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
// end inline asm
add.s32 %r247, %r246, %r12;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
// end inline asm
add.s32 %r273, %r272, %r12;
not.pred %p26, %p3;
mov.f32 %f371, %f370;
mov.f32 %f372, %f370;
mov.f32 %f373, %f370;
mov.f32 %f382, %f370;
mov.f32 %f383, %f370;
mov.f32 %f384, %f370;
mov.f32 %f385, %f370;
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
mad.lo.s32 %r244, %r420, %r4, %r8;
add.s32 %r245, %r244, %r11;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
mul.lo.s32 %r249, %r13, %r420;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
mov.u32 %r248, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r248, 0;
cp.async.ca.shared.global [%r247], [%rd66], 16, p0;
}
// end inline asm
$L__BB0_8:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p16 bra $L__BB0_10;
add.s32 %r250, %r14, %r420;
mad.lo.s32 %r251, %r250, %r4, %r8;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
add.s32 %r260, %r14, %r420;
mad.lo.s32 %r261, %r260, %r4, %r8;
setp.gt.s32 %p20, %r261, 63;
mov.u32 %r421, 0;
mov.u32 %r422, %r421;
mov.u32 %r423, %r421;
mov.u32 %r424, %r421;
@%p20 bra $L__BB0_15;
ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
mov.u32 %r421, 0;
mov.u32 %r422, %r421;
mov.u32 %r423, %r421;
mov.u32 %r424, %r421;
$L__BB0_15:
add.s32 %r270, %r14, %r420;
mad.lo.s32 %r33, %r270, %r4, %r8;
mov.b32 %f117, %r424;
add.f32 %f385, %f385, %f117;
mov.b32 %f118, %r423;
add.f32 %f384, %f384, %f118;
mov.b32 %f119, %r422;
add.f32 %f383, %f383, %f119;
mov.b32 %f120, %r421;
add.f32 %f382, %f382, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f368, 0f00000000;
@%p21 bra $L__BB0_17;
mul.lo.s32 %r271, %r33, %r169;
mul.wide.s32 %rd71, %r271, 4;
add.s64 %rd72, %rd15, %rd71;
ld.global.f32 %f368, [%rd72];
$L__BB0_17:
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
mul.lo.s32 %r275, %r13, %r420;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
mov.u32 %r274, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r274, 0;
cp.async.ca.shared.global [%r273], [%rd74], 16, p0;
}
// end inline asm
$L__BB0_19:
add.s32 %r419, %r14, %r420;
mad.lo.s32 %r418, %r419, %r4, %r8;
setp.gt.s32 %p145, %r418, 63;
mov.f32 %f374, 0f00000000;
mov.f32 %f369, %f374;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
mul.wide.s32 %rd79, %r276, 4;
add.s64 %rd80, %rd16, %rd79;
ld.global.f32 %f369, [%rd80];
$L__BB0_21:
// begin inline asm
cp.async.wait_all;
// end inline asm
mov.f32 %f375, %f374;
@%p23 bra $L__BB0_23;
ld.shared.v4.f32 {%f124, %f125, %f126, %f127}, [%rd12];
sub.f32 %f129, %f124, %f368;
mul.f32 %f130, %f369, %f129;
ld.shared.v4.f32 {%f131, %f132, %f133, %f134}, [%rd7];
fma.rn.f32 %f370, %f130, %f131, %f370;
ld.shared.v4.f32 {%f136, %f137, %f138, %f139}, [%rd10];
mul.f32 %f141, %f136, %f131;
add.f32 %f142, %f141, 0f00000000;
fma.rn.f32 %f143, %f130, %f141, 0f00000000;
sub.f32 %f145, %f125, %f368;
mul.f32 %f146, %f369, %f145;
fma.rn.f32 %f371, %f146, %f132, %f371;
mul.f32 %f149, %f137, %f132;
add.f32 %f150, %f142, %f149;
fma.rn.f32 %f151, %f146, %f149, %f143;
sub.f32 %f153, %f126, %f368;
mul.f32 %f154, %f369, %f153;
fma.rn.f32 %f372, %f154, %f133, %f372;
mul.f32 %f157, %f138, %f133;
add.f32 %f158, %f150, %f157;
fma.rn.f32 %f159, %f154, %f157, %f151;
sub.f32 %f161, %f127, %f368;
mul.f32 %f162, %f369, %f161;
fma.rn.f32 %f373, %f162, %f134, %f373;
mul.f32 %f165, %f139, %f134;
add.f32 %f375, %f158, %f165;
fma.rn.f32 %f374, %f162, %f165, %f159;
$L__BB0_23:
st.shared.f32 [%rd8], %f375;
bar.sync 0;
@%p26 bra $L__BB0_25;
ld.shared.f32 %f166, [%rd9];
ld.shared.f32 %f167, [%rd8];
add.f32 %f168, %f166, %f167;
st.shared.f32 [%rd8], %f168;
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
mov.u32 %r425, %r17;
$L__BB0_27:
setp.ge.u32 %p28, %r6, %r425;
@%p28 bra $L__BB0_29;
add.s32 %r277, %r425, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
shr.u32 %r35, %r425, 1;
setp.gt.u32 %p29, %r425, 3;
mov.u32 %r425, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
setp.ne.s32 %p30, %r6, 0;
mov.f32 %f376, 0f00000000;
@%p30 bra $L__BB0_33;
setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f376, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
add.f32 %f376, %f376, %f174;
$L__BB0_33:
bar.sync 0;
st.shared.f32 [%rd8], %f374;
bar.sync 0;
@%p26 bra $L__BB0_35;
ld.shared.f32 %f175, [%rd9];
ld.shared.f32 %f176, [%rd8];
add.f32 %f177, %f175, %f176;
st.shared.f32 [%rd8], %f177;
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
mov.u32 %r426, %r17;
$L__BB0_37:
setp.ge.u32 %p34, %r6, %r426;
@%p34 bra $L__BB0_39;
add.s32 %r278, %r426, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
shr.u32 %r37, %r426, 1;
setp.gt.u32 %p35, %r426, 3;
mov.u32 %r426, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f377, 0f00000000;
@%p30 bra $L__BB0_43;
setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f377, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
add.f32 %f377, %f377, %f183;
$L__BB0_43:
bar.sync 0;
@%p30 bra $L__BB0_45;
st.shared.f32 [%rd13], %f376;
$L__BB0_45:
bar.sync 0;
ld.shared.f32 %f37, [%rd13];
bar.sync 0;
@%p30 bra $L__BB0_47;
st.shared.f32 [%rd13], %f377;
$L__BB0_47:
bar.sync 0;
ld.shared.f32 %f38, [%rd13];
bar.sync 0;
@%p23 bra $L__BB0_49;
mul.f32 %f184, %f369, %f1;
ld.shared.v4.f32 {%f185, %f186, %f187, %f188}, [%rd10];
ld.shared.v4.f32 {%f190, %f191, %f192, %f193}, [%rd7];
mul.f32 %f195, %f185, %f190;
mul.f32 %f196, %f195, %f2;
ld.shared.v4.f32 {%f197, %f198, %f199, %f200}, [%rd12];
sub.f32 %f202, %f197, %f368;
mul.f32 %f203, %f369, %f202;
sub.f32 %f204, %f196, %f37;
mul.f32 %f205, %f38, %f203;
sub.f32 %f206, %f204, %f205;
mul.f32 %f207, %f184, %f206;
mov.b32 %r279, %f207;
mul.f32 %f210, %f186, %f191;
mul.f32 %f211, %f210, %f2;
sub.f32 %f213, %f198, %f368;
mul.f32 %f214, %f369, %f213;
sub.f32 %f215, %f211, %f37;
mul.f32 %f216, %f38, %f214;
sub.f32 %f217, %f215, %f216;
mul.f32 %f218, %f184, %f217;
mov.b32 %r280, %f218;
mul.f32 %f221, %f187, %f192;
mul.f32 %f222, %f221, %f2;
sub.f32 %f224, %f199, %f368;
mul.f32 %f225, %f369, %f224;
sub.f32 %f226, %f222, %f37;
mul.f32 %f227, %f38, %f225;
sub.f32 %f228, %f226, %f227;
mul.f32 %f229, %f184, %f228;
mov.b32 %r281, %f229;
mul.f32 %f232, %f188, %f193;
mul.f32 %f233, %f232, %f2;
sub.f32 %f235, %f200, %f368;
mul.f32 %f236, %f369, %f235;
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
mad.lo.s32 %r283, %r33, %r176, %r7;
mul.wide.s32 %rd88, %r283, 4;
add.s64 %rd87, %rd40, %rd88;
// begin inline asm
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
// end inline asm
$L__BB0_49:
add.s32 %r420, %r420, 1;
setp.lt.s32 %p41, %r420, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f370, 0f00000000;
mov.f32 %f371, %f370;
mov.f32 %f372, %f370;
mov.f32 %f373, %f370;
mov.f32 %f382, %f370;
mov.f32 %f383, %f370;
mov.f32 %f384, %f370;
mov.f32 %f385, %f370;
$L__BB0_50:
mov.u32 %r284, %tid.z;
mad.lo.s32 %r285, %r4, %r284, %r8;
mad.lo.s32 %r39, %r285, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
clz.b32 %r286, %r4;
mov.u32 %r287, 31;
sub.s32 %r288, %r287, %r286;
mov.u32 %r289, 1;
shl.b32 %r40, %r289, %r288;
setp.lt.u32 %p42, %r8, %r40;
add.s32 %r290, %r40, %r8;
setp.lt.u32 %p43, %r290, %r4;
and.pred %p5, %p42, %p43;
shl.b32 %r291, %r3, %r288;
add.s32 %r292, %r39, %r291;
mul.wide.s32 %rd91, %r292, 4;
add.s64 %rd24, %rd45, %rd91;
shr.u32 %r293, %r40, 31;
add.s32 %r294, %r40, %r293;
shr.s32 %r441, %r294, 1;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
ld.shared.f32 %f241, [%rd24];
ld.shared.f32 %f242, [%rd23];
add.f32 %f243, %f241, %f242;
st.shared.f32 [%rd23], %f243;
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
mov.u32 %r427, %r441;
$L__BB0_54:
setp.ge.u32 %p46, %r8, %r427;
@%p46 bra $L__BB0_56;
mad.lo.s32 %r295, %r427, %r3, %r39;
mul.wide.s32 %rd92, %r295, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
shr.u32 %r43, %r427, 1;
setp.gt.u32 %p47, %r427, 3;
mov.u32 %r427, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
add.s32 %r297, %r39, %r3;
mul.wide.u32 %rd95, %r297, 4;
add.s64 %rd25, %rd45, %rd95;
setp.ne.s32 %p48, %r8, 0;
mov.u32 %r428, 0;
@%p48 bra $L__BB0_61;
setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f386, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f386, %f386, %f248;
$L__BB0_60:
mov.b32 %r428, %f386;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@%p44 bra $L__BB0_63;
ld.shared.f32 %f249, [%rd24];
ld.shared.f32 %f250, [%rd23];
add.f32 %f251, %f249, %f250;
st.shared.f32 [%rd23], %f251;
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
mov.u32 %r429, %r441;
$L__BB0_65:
setp.ge.u32 %p52, %r8, %r429;
@%p52 bra $L__BB0_67;
mad.lo.s32 %r298, %r429, %r3, %r39;
mul.wide.s32 %rd97, %r298, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
shr.u32 %r47, %r429, 1;
setp.gt.u32 %p53, %r429, 3;
mov.u32 %r429, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
mov.u32 %r430, 0;
@%p48 bra $L__BB0_72;
setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f387, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f387, %f387, %f256;
$L__BB0_71:
mov.b32 %r430, %f387;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f384;
bar.sync 0;
@%p44 bra $L__BB0_74;
ld.shared.f32 %f257, [%rd24];
ld.shared.f32 %f258, [%rd23];
add.f32 %f259, %f257, %f258;
st.shared.f32 [%rd23], %f259;
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
mov.u32 %r431, %r441;
$L__BB0_76:
setp.ge.u32 %p58, %r8, %r431;
@%p58 bra $L__BB0_78;
mad.lo.s32 %r300, %r431, %r3, %r39;
mul.wide.s32 %rd100, %r300, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
shr.u32 %r51, %r431, 1;
setp.gt.u32 %p59, %r431, 3;
mov.u32 %r431, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
mov.u32 %r432, 0;
@%p48 bra $L__BB0_83;
setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f388, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f388, %f388, %f264;
$L__BB0_82:
mov.b32 %r432, %f388;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f385;
bar.sync 0;
@%p44 bra $L__BB0_85;
ld.shared.f32 %f265, [%rd24];
ld.shared.f32 %f266, [%rd23];
add.f32 %f267, %f265, %f266;
st.shared.f32 [%rd23], %f267;
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
mov.u32 %r433, %r441;
$L__BB0_87:
setp.ge.u32 %p64, %r8, %r433;
@%p64 bra $L__BB0_89;
mad.lo.s32 %r302, %r433, %r3, %r39;
mul.wide.s32 %rd103, %r302, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
shr.u32 %r55, %r433, 1;
setp.gt.u32 %p65, %r433, 3;
mov.u32 %r433, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
mov.u32 %r434, 0;
@%p48 bra $L__BB0_94;
setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f389, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f389, %f389, %f272;
$L__BB0_93:
mov.b32 %r434, %f389;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@%p44 bra $L__BB0_96;
ld.shared.f32 %f273, [%rd24];
ld.shared.f32 %f274, [%rd23];
add.f32 %f275, %f273, %f274;
st.shared.f32 [%rd23], %f275;
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
mov.u32 %r435, %r441;
$L__BB0_98:
setp.ge.u32 %p70, %r8, %r435;
@%p70 bra $L__BB0_100;
mad.lo.s32 %r304, %r435, %r3, %r39;
mul.wide.s32 %rd106, %r304, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
shr.u32 %r59, %r435, 1;
setp.gt.u32 %p71, %r435, 3;
mov.u32 %r435, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
mov.u32 %r436, 0;
@%p48 bra $L__BB0_105;
setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f390, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f390, %f390, %f280;
$L__BB0_104:
mov.b32 %r436, %f390;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@%p44 bra $L__BB0_107;
ld.shared.f32 %f281, [%rd24];
ld.shared.f32 %f282, [%rd23];
add.f32 %f283, %f281, %f282;
st.shared.f32 [%rd23], %f283;
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
mov.u32 %r437, %r441;
$L__BB0_109:
setp.ge.u32 %p76, %r8, %r437;
@%p76 bra $L__BB0_111;
mad.lo.s32 %r306, %r437, %r3, %r39;
mul.wide.s32 %rd109, %r306, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
shr.u32 %r63, %r437, 1;
setp.gt.u32 %p77, %r437, 3;
mov.u32 %r437, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
mov.u32 %r438, 0;
@%p48 bra $L__BB0_116;
setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f391, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f391, %f391, %f288;
$L__BB0_115:
mov.b32 %r438, %f391;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f372;
bar.sync 0;
@%p44 bra $L__BB0_118;
ld.shared.f32 %f289, [%rd24];
ld.shared.f32 %f290, [%rd23];
add.f32 %f291, %f289, %f290;
st.shared.f32 [%rd23], %f291;
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
mov.u32 %r439, %r441;
$L__BB0_120:
setp.ge.u32 %p82, %r8, %r439;
@%p82 bra $L__BB0_122;
mad.lo.s32 %r308, %r439, %r3, %r39;
mul.wide.s32 %rd112, %r308, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
shr.u32 %r67, %r439, 1;
setp.gt.u32 %p83, %r439, 3;
mov.u32 %r439, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
mov.u32 %r440, 0;
@%p48 bra $L__BB0_127;
setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f392, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f392, %f392, %f296;
$L__BB0_126:
mov.b32 %r440, %f392;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f373;
bar.sync 0;
@%p44 bra $L__BB0_129;
ld.shared.f32 %f297, [%rd24];
ld.shared.f32 %f298, [%rd23];
add.f32 %f299, %f297, %f298;
st.shared.f32 [%rd23], %f299;
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
setp.ge.u32 %p88, %r8, %r441;
@%p88 bra $L__BB0_132;
mad.lo.s32 %r310, %r441, %r3, %r39;
mul.wide.s32 %rd115, %r310, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
shr.u32 %r71, %r441, 1;
setp.gt.u32 %p89, %r441, 3;
mov.u32 %r441, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
mov.u32 %r442, 0;
@%p48 bra $L__BB0_137;
setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f393, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f393, %f393, %f304;
$L__BB0_136:
mov.b32 %r442, %f393;
$L__BB0_137:
setp.eq.s32 %p144, %r8, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
mov.u32 %r320, %ctaid.y;
mad.lo.s32 %r321, %r176, %r320, %r7;
mul.wide.s32 %rd120, %r321, 4;
add.s64 %rd118, %rd42, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd118], {%r428,%r430,%r432,%r434};
// end inline asm
add.s64 %rd119, %rd43, %rd120;
// begin inline asm
st.volatile.global.v4.s32 [%rd119], {%r436,%r438,%r440,%r442};
// end inline asm
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r322, %r6, %r8;
or.b32 %r324, %r322, %r284;
setp.ne.s32 %p92, %r324, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
mov.u32 %r325, %ctaid.x;
mov.u32 %r326, %ctaid.z;
mov.u32 %r327, %nctaid.x;
mad.lo.s32 %r328, %r326, %r327, %r325;
mul.wide.s32 %rd122, %r328, 8;
add.s64 %rd28, %rd121, %rd122;
add.s32 %r329, %r9, -1;
setp.eq.s32 %p93, %r74, %r329;
cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
mov.u32 %r443, 8;
$L__BB0_142:
// begin inline asm
nanosleep.u32 %r443;
// end inline asm
setp.lt.u32 %p95, %r443, 256;
selp.u32 %r332, 1, 0, %p95;
shl.b32 %r443, %r443, %r332;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
add.s32 %r333, %r9, %r3;
add.s32 %r334, %r333, -1;
div.s32 %r77, %r334, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f396, 0f00000000;
mov.f32 %f397, %f396;
@%p97 bra $L__BB0_149;
add.s32 %r336, %r176, 1;
shr.u32 %r337, %r336, 31;
add.s32 %r338, %r336, %r337;
shr.s32 %r339, %r338, 1;
add.s32 %r340, %r4, %r339;
add.s32 %r341, %r340, -1;
shl.b32 %r342, %r8, 1;
shl.b32 %r343, %r4, 1;
mad.lo.s32 %r344, %r343, %r74, %r342;
or.b32 %r345, %r344, 1;
setp.ge.s32 %p98, %r345, %r176;
div.s32 %r346, %r341, %r4;
setp.ge.s32 %p99, %r74, %r346;
or.pred %p6, %p99, %p98;
mul.lo.s32 %r347, %r4, %r74;
shl.b32 %r348, %r347, 1;
mad.lo.s32 %r349, %r176, %r6, %r348;
add.s32 %r445, %r349, %r342;
mul.lo.s32 %r79, %r176, %r3;
mov.u32 %r335, 0;
mov.f32 %f396, 0f00000000;
mov.u32 %r444, %r6;
mov.u32 %r446, %r335;
$L__BB0_145:
.pragma "nounroll";
mov.u32 %r447, %r335;
mov.u32 %r448, %r335;
@%p6 bra $L__BB0_148;
setp.ge.s32 %p100, %r444, %r9;
mov.u32 %r447, %r335;
mov.u32 %r448, %r335;
@%p100 bra $L__BB0_148;
mul.wide.s32 %rd132, %r445, 4;
add.s64 %rd131, %rd43, %rd132;
// begin inline asm
ld.volatile.global.v2.s32 {%r448,%r447}, [%rd131];
// end inline asm
$L__BB0_148:
mov.b32 %f309, %r448;
add.f32 %f396, %f396, %f309;
mov.b32 %f310, %r447;
add.f32 %f397, %f397, %f310;
add.s32 %r445, %r445, %r79;
add.s32 %r444, %r444, %r3;
add.s32 %r446, %r446, 1;
setp.lt.s32 %p101, %r446, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
clz.b32 %r356, %r3;
mov.u32 %r357, 31;
sub.s32 %r358, %r357, %r356;
mov.u32 %r359, 1;
shl.b32 %r90, %r359, %r358;
setp.lt.u32 %p102, %r6, %r90;
add.s32 %r360, %r90, %r6;
setp.lt.u32 %p103, %r360, %r3;
and.pred %p7, %p102, %p103;
add.s32 %r361, %r39, %r90;
mul.wide.s32 %rd133, %r361, 4;
add.s64 %rd30, %rd45, %rd133;
shr.u32 %r362, %r90, 31;
add.s32 %r363, %r90, %r362;
shr.s32 %r459, %r363, 1;
st.shared.f32 [%rd23], %f396;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
ld.shared.f32 %f311, [%rd30];
ld.shared.f32 %f312, [%rd23];
add.f32 %f313, %f311, %f312;
st.shared.f32 [%rd23], %f313;
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
mov.u32 %r449, %r459;
$L__BB0_153:
setp.ge.u32 %p106, %r6, %r449;
@%p106 bra $L__BB0_155;
add.s32 %r364, %r449, %r39;
mul.wide.s32 %rd135, %r364, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
shr.u32 %r93, %r449, 1;
setp.gt.u32 %p107, %r449, 3;
mov.u32 %r449, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
add.s32 %r366, %r39, 1;
mul.wide.u32 %rd138, %r366, 4;
add.s64 %rd31, %rd45, %rd138;
setp.ne.s32 %p108, %r6, 0;
mov.u32 %r450, 0;
@%p108 bra $L__BB0_160;
setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f398, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f398, %f398, %f318;
$L__BB0_159:
mov.b32 %r450, %f398;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f397;
bar.sync 0;
@%p104 bra $L__BB0_162;
ld.shared.f32 %f319, [%rd30];
ld.shared.f32 %f320, [%rd23];
add.f32 %f321, %f319, %f320;
st.shared.f32 [%rd23], %f321;
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
mov.u32 %r451, %r459;
$L__BB0_164:
setp.ge.u32 %p112, %r6, %r451;
@%p112 bra $L__BB0_166;
add.s32 %r367, %r451, %r39;
mul.wide.s32 %rd140, %r367, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
shr.u32 %r97, %r451, 1;
setp.gt.u32 %p113, %r451, 3;
mov.u32 %r451, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
mov.u32 %r452, 0;
@%p108 bra $L__BB0_171;
setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f399, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f399, %f399, %f326;
$L__BB0_170:
mov.b32 %r452, %f399;
$L__BB0_171:
bar.sync 0;
setp.eq.s32 %p116, %r6, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
add.s32 %r369, %r176, 1;
shr.u32 %r370, %r369, 31;
add.s32 %r371, %r369, %r370;
shr.s32 %r372, %r371, 1;
add.s32 %r373, %r4, %r372;
add.s32 %r374, %r373, -1;
div.s32 %r375, %r374, %r4;
setp.ge.s32 %p117, %r74, %r375;
@%p117 bra $L__BB0_175;
shl.b32 %r100, %r8, 1;
mul.lo.s32 %r376, %r4, %r74;
shl.b32 %r101, %r376, 1;
add.s32 %r377, %r100, %r101;
or.b32 %r378, %r377, 1;
setp.ge.s32 %p118, %r378, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
add.s32 %r381, %r101, %r100;
mul.wide.s32 %rd144, %r381, 4;
add.s64 %rd143, %rd160, %rd144;
// begin inline asm
st.global.cs.v2.s32 [%rd143], {%r450,%r452};
// end inline asm
$L__BB0_175:
add.s32 %r382, %r176, 1;
shr.u32 %r383, %r382, 31;
add.s32 %r384, %r382, %r383;
shr.s32 %r385, %r384, 1;
add.s32 %r386, %r4, %r385;
add.s32 %r387, %r386, -1;
div.s32 %r102, %r387, %r4;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f400, 0f00000000;
mov.f32 %f404, 0f00000000;
mov.f32 %f401, %f404;
@%p119 bra $L__BB0_178;
shl.b32 %r103, %r8, 1;
mul.lo.s32 %r388, %r4, %r74;
shl.b32 %r104, %r388, 1;
add.s32 %r389, %r103, %r104;
or.b32 %r390, %r389, 1;
setp.ge.s32 %p120, %r390, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
add.s32 %r393, %r104, %r103;
mul.wide.s32 %rd146, %r393, 4;
add.s64 %rd145, %rd159, %rd146;
// begin inline asm
ld.global.cs.v2.u32 {%r391,%r392}, [%rd145];
// end inline asm
mov.b32 %f400, %r391;
mov.b32 %f401, %r392;
$L__BB0_178:
mov.f32 %f405, %f404;
@%p97 bra $L__BB0_184;
shl.b32 %r395, %r8, 1;
shl.b32 %r396, %r4, 1;
mad.lo.s32 %r397, %r396, %r74, %r395;
or.b32 %r398, %r397, 1;
setp.ge.s32 %p122, %r398, %r176;
or.pred %p8, %p122, %p119;
mul.lo.s32 %r399, %r4, %r74;
shl.b32 %r400, %r399, 1;
mad.lo.s32 %r401, %r176, %r6, %r400;
add.s32 %r454, %r401, %r395;
mul.lo.s32 %r106, %r176, %r3;
mov.u32 %r394, 0;
mov.f32 %f404, 0f00000000;
mov.u32 %r453, %r6;
mov.f32 %f405, %f404;
mov.u32 %r455, %r394;
$L__BB0_180:
.pragma "nounroll";
mov.u32 %r456, %r394;
mov.u32 %r457, %r394;
@%p8 bra $L__BB0_183;
setp.ge.s32 %p124, %r453, %r9;
mov.u32 %r456, %r394;
mov.u32 %r457, %r394;
@%p124 bra $L__BB0_183;
mul.wide.s32 %rd148, %r454, 4;
add.s64 %rd147, %rd42, %rd148;
// begin inline asm
ld.volatile.global.v2.s32 {%r457,%r456}, [%rd147];
// end inline asm
$L__BB0_183:
mov.b32 %f335, %r457;
add.f32 %f404, %f404, %f335;
mov.b32 %f336, %r456;
add.f32 %f405, %f405, %f336;
add.s32 %r454, %r454, %r106;
add.s32 %r453, %r453, %r3;
add.s32 %r455, %r455, 1;
setp.lt.s32 %p125, %r455, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f404;
bar.sync 0;
@%p104 bra $L__BB0_186;
ld.shared.f32 %f337, [%rd30];
ld.shared.f32 %f338, [%rd23];
add.f32 %f339, %f337, %f338;
st.shared.f32 [%rd23], %f339;
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
mov.u32 %r458, %r459;
$L__BB0_188:
setp.ge.u32 %p128, %r6, %r458;
@%p128 bra $L__BB0_190;
add.s32 %r408, %r458, %r39;
mul.wide.s32 %rd149, %r408, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
shr.u32 %r118, %r458, 1;
setp.gt.u32 %p129, %r458, 3;
mov.u32 %r458, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f406, 0f00000000;
@%p108 bra $L__BB0_194;
setp.lt.u32 %p131, %r3, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f406, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
add.f32 %f406, %f406, %f345;
$L__BB0_194:
bar.sync 0;
st.shared.f32 [%rd23], %f405;
bar.sync 0;
@%p104 bra $L__BB0_196;
ld.shared.f32 %f346, [%rd30];
ld.shared.f32 %f347, [%rd23];
add.f32 %f348, %f346, %f347;
st.shared.f32 [%rd23], %f348;
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
setp.ge.u32 %p134, %r6, %r459;
@%p134 bra $L__BB0_199;
add.s32 %r409, %r459, %r39;
mul.wide.s32 %rd152, %r409, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
shr.u32 %r120, %r459, 1;
setp.gt.u32 %p135, %r459, 3;
mov.u32 %r459, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f353, %f406, 0f3F800000;
add.f32 %f96, %f353, %f400;
mov.f32 %f407, 0f00000000;
@%p108 bra $L__BB0_203;
setp.lt.u32 %p137, %r3, 2;
ld.shared.f32 %f354, [%rd23];
add.f32 %f407, %f354, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f355, [%rd31];
add.f32 %f407, %f407, %f355;
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
shl.b32 %r121, %r8, 1;
mul.lo.s32 %r410, %r4, %r74;
shl.b32 %r122, %r410, 1;
add.s32 %r411, %r121, %r122;
or.b32 %r412, %r411, 1;
setp.ge.s32 %p141, %r412, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_63_cu_37f2fec9_723310nvfuser_63ENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
add.s32 %r417, %r122, %r121;
mul.wide.s32 %rd157, %r417, 4;
add.s64 %rd155, %rd158, %rd157;
mov.b32 %r414, %f407;
mov.b32 %r413, %f406;
// begin inline asm
st.global.cs.v2.s32 [%rd155], {%r413,%r414};
// end inline asm
add.s64 %rd156, %rd162, %rd157;
add.f32 %f356, %f407, 0f3F800000;
add.f32 %f357, %f356, %f401;
mov.b32 %r416, %f357;
mov.b32 %r415, %f96;
// begin inline asm
st.global.cs.v2.s32 [%rd156], {%r415,%r416};
// end inline asm
$L__BB0_206:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -31,11 +31,11 @@
.param .align 8 .b8 _ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12[16]
)
{
.reg .pred %p<146>;
.reg .f32 %f<408>;
- .reg .b32 %r<462>;
+ .reg .b32 %r<460>;
.reg .f64 %fd<3>;
.reg .b64 %rd<163>;
ld.param.v2.u32 {%r169, %r170}, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_1+16];
@@ -51,119 +51,119 @@
ld.param.u64 %rd32, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_0];
add.s32 %r203, %r176, 3;
shr.s32 %r204, %r203, 31;
shr.u32 %r205, %r204, 30;
add.s32 %r206, %r203, %r205;
- shr.s32 %r207, %r206, 2;
- mov.u32 %r2, %ntid.x;
- max.s32 %r208, %r207, %r2;
- mov.u32 %r3, %ntid.y;
- shl.b32 %r209, %r3, 2;
- mad.lo.s32 %r210, %r209, %r208, 15;
- and.b32 %r211, %r210, -16;
- cvt.u64.u32 %rd1, %r211;
- mul.lo.s32 %r212, %r3, %r207;
- shl.b32 %r213, %r212, 4;
- or.b32 %r214, %r213, 15;
- and.b32 %r4, %r214, -16;
- add.s32 %r215, %r214, %r4;
- and.b32 %r216, %r215, -16;
- cvt.s64.s32 %rd2, %r216;
+ shr.s32 %r2, %r206, 2;
+ mov.u32 %r3, %ntid.x;
+ max.s32 %r207, %r2, %r3;
+ mov.u32 %r4, %ntid.y;
+ shl.b32 %r208, %r4, 2;
+ mad.lo.s32 %r209, %r208, %r207, 15;
+ and.b32 %r210, %r209, -16;
+ cvt.u64.u32 %rd1, %r210;
+ mul.lo.s32 %r211, %r4, %r2;
+ shl.b32 %r212, %r211, 4;
+ or.b32 %r213, %r212, 15;
+ and.b32 %r5, %r213, -16;
+ add.s32 %r214, %r213, %r5;
+ and.b32 %r215, %r214, -16;
+ cvt.s64.s32 %rd2, %r215;
mov.u64 %rd45, _ZN11kernelscope6kernelE;
cvta.shared.u64 %rd46, %rd45;
add.s64 %rd3, %rd46, %rd1;
- mov.u32 %r5, %tid.x;
- setp.lt.s32 %p9, %r5, %r207;
- shl.b32 %r6, %r5, 2;
- or.b32 %r217, %r6, 3;
- setp.lt.s32 %p10, %r217, %r176;
+ mov.u32 %r6, %tid.x;
+ setp.lt.s32 %p9, %r6, %r2;
+ shl.b32 %r7, %r6, 2;
+ or.b32 %r216, %r7, 3;
+ setp.lt.s32 %p10, %r216, %r176;
and.pred %p1, %p10, %p9;
- mov.u32 %r7, %tid.y;
- setp.eq.s32 %p11, %r7, 0;
+ mov.u32 %r8, %tid.y;
+ setp.eq.s32 %p11, %r8, 0;
and.pred %p2, %p11, %p1;
not.pred %p12, %p2;
@%p12 bra $L__BB0_2;
add.s64 %rd47, %rd3, %rd2;
- { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r218, smem_ptr; }
-
-
- shl.b32 %r221, %r5, 4;
- add.s32 %r219, %r218, %r221;
- mul.wide.s32 %rd49, %r6, 4;
+ { .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd47; cvt.u32.u64 %r217, smem_ptr; }
+
+
+ shl.b32 %r220, %r6, 4;
+ add.s32 %r218, %r217, %r220;
+ mul.wide.s32 %rd49, %r7, 4;
add.s64 %rd48, %rd36, %rd49;
- mov.u32 %r220, 0;
+ mov.u32 %r219, 0;
{
.reg .pred p0;
- setp.ne.b32 p0, %r220, 0;
- cp.async.ca.shared.global [%r219], [%rd48], 16, p0;
+ setp.ne.b32 p0, %r219, 0;
+ cp.async.ca.shared.global [%r218], [%rd48], 16, p0;
}
$L__BB0_2:
bar.sync 0;
- add.s32 %r222, %r3, 63;
- div.s32 %r223, %r222, %r3;
- mov.u32 %r8, %nctaid.y;
- add.s32 %r224, %r8, %r223;
- add.s32 %r225, %r224, -1;
- div.s32 %r9, %r225, %r8;
- setp.gt.s32 %p13, %r9, 0;
+ add.s32 %r221, %r4, 63;
+ div.s32 %r222, %r221, %r4;
+ mov.u32 %r9, %nctaid.y;
+ add.s32 %r223, %r9, %r222;
+ add.s32 %r224, %r223, -1;
+ div.s32 %r10, %r224, %r9;
+ setp.gt.s32 %p13, %r10, 0;
add.s64 %rd4, %rd2, %rd1;
@%p13 bra $L__BB0_4;
bra.uni $L__BB0_3;
$L__BB0_4:
cvt.rn.f64.s32 %fd1, %r176;
- cvt.s64.s32 %rd50, %r4;
+ cvt.s64.s32 %rd50, %r5;
add.s64 %rd51, %rd1, %rd50;
add.s64 %rd53, %rd45, %rd1;
- mov.u32 %r227, %ctaid.y;
- mul.lo.s32 %r228, %r9, %r3;
- mul.lo.s32 %r10, %r228, %r227;
- shl.b32 %r229, %r7, 2;
- shl.b32 %r230, %r5, 4;
- mad.lo.s32 %r11, %r229, %r176, %r230;
- mul.lo.s32 %r231, %r176, %r7;
- cvt.s64.s32 %rd54, %r231;
- cvt.s64.s32 %rd55, %r6;
+ mov.u32 %r226, %ctaid.y;
+ mul.lo.s32 %r227, %r10, %r4;
+ mul.lo.s32 %r11, %r227, %r226;
+ mad.lo.s32 %r228, %r2, %r8, %r6;
+ shl.b32 %r12, %r228, 4;
+ mul.lo.s32 %r229, %r176, %r8;
+ cvt.s64.s32 %rd54, %r229;
+ cvt.s64.s32 %rd55, %r7;
add.s64 %rd5, %rd54, %rd55;
- mul.lo.s32 %r232, %r10, %r176;
- cvt.s64.s32 %rd6, %r232;
- mul.lo.s32 %r12, %r176, %r3;
- mul.lo.s32 %r13, %r9, %r227;
- add.s32 %r14, %r231, %r6;
+ mul.lo.s32 %r230, %r11, %r176;
+ cvt.s64.s32 %rd6, %r230;
+ mul.lo.s32 %r13, %r176, %r4;
+ mul.lo.s32 %r14, %r10, %r226;
+ shl.b32 %r231, %r8, 2;
+ mad.lo.s32 %r232, %r231, %r2, %r7;
add.s64 %rd56, %rd45, %rd51;
- mul.wide.s32 %rd57, %r14, 4;
+ mul.wide.s32 %rd57, %r232, 4;
add.s64 %rd7, %rd56, %rd57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.f32.f64 %f1, %fd2;
mov.u32 %r233, %tid.z;
- mad.lo.s32 %r234, %r3, %r233, %r7;
- mad.lo.s32 %r15, %r234, %r2, %r5;
+ mad.lo.s32 %r234, %r4, %r233, %r8;
+ mad.lo.s32 %r15, %r234, %r3, %r6;
mul.wide.u32 %rd58, %r15, 4;
add.s64 %rd8, %rd45, %rd58;
- clz.b32 %r235, %r2;
+ clz.b32 %r235, %r3;
mov.u32 %r236, 31;
sub.s32 %r237, %r236, %r235;
mov.u32 %r238, 1;
shl.b32 %r16, %r238, %r237;
- setp.lt.u32 %p14, %r5, %r16;
- add.s32 %r239, %r16, %r5;
- setp.lt.u32 %p15, %r239, %r2;
+ setp.lt.u32 %p14, %r6, %r16;
+ add.s32 %r239, %r16, %r6;
+ setp.lt.u32 %p15, %r239, %r3;
and.pred %p3, %p14, %p15;
add.s32 %r240, %r15, %r16;
mul.wide.s32 %rd59, %r240, 4;
add.s64 %rd9, %rd45, %rd59;
shr.u32 %r241, %r16, 31;
add.s32 %r242, %r16, %r241;
shr.s32 %r17, %r242, 1;
add.s64 %rd60, %rd45, %rd4;
- mul.wide.s32 %rd61, %r6, 4;
+ mul.wide.s32 %rd61, %r7, 4;
add.s64 %rd10, %rd60, %rd61;
add.s32 %r243, %r15, 1;
mul.wide.u32 %rd62, %r243, 4;
add.s64 %rd11, %rd45, %rd62;
add.s64 %rd12, %rd53, %rd57;
@@ -171,23 +171,23 @@
add.s64 %rd13, %rd45, %rd63;
cvt.rn.f32.f64 %f2, %fd1;
cvta.to.global.u64 %rd15, %rd33;
cvta.to.global.u64 %rd16, %rd34;
add.s64 %rd19, %rd46, %rd51;
- mov.u32 %r422, 0;
+ mov.u32 %r420, 0;
mov.f32 %f370, 0f00000000;
not.pred %p16, %p1;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd19; cvt.u32.u64 %r246, smem_ptr; }
- add.s32 %r247, %r11, %r246;
+ add.s32 %r247, %r246, %r12;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd3; cvt.u32.u64 %r272, smem_ptr; }
- add.s32 %r273, %r11, %r272;
+ add.s32 %r273, %r272, %r12;
not.pred %p26, %p3;
mov.f32 %f371, %f370;
mov.f32 %f372, %f370;
mov.f32 %f373, %f370;
mov.f32 %f382, %f370;
@@ -197,16 +197,16 @@
$L__BB0_5:
.pragma "nounroll";
@%p16 bra $L__BB0_8;
- mad.lo.s32 %r244, %r422, %r3, %r7;
- add.s32 %r245, %r244, %r10;
+ mad.lo.s32 %r244, %r420, %r4, %r8;
+ add.s32 %r245, %r244, %r11;
setp.gt.s32 %p17, %r245, 63;
@%p17 bra $L__BB0_8;
- mul.lo.s32 %r249, %r12, %r422;
+ mul.lo.s32 %r249, %r13, %r420;
cvt.s64.s32 %rd67, %r249;
add.s64 %rd68, %rd5, %rd67;
add.s64 %rd69, %rd68, %rd6;
shl.b64 %rd70, %rd69, 2;
add.s64 %rd66, %rd35, %rd70;
@@ -225,53 +225,53 @@
cp.async.wait_all;
@%p16 bra $L__BB0_10;
- add.s32 %r250, %r13, %r422;
- mad.lo.s32 %r251, %r250, %r3, %r7;
+ add.s32 %r250, %r14, %r420;
+ mad.lo.s32 %r251, %r250, %r4, %r8;
setp.lt.s32 %p19, %r251, 64;
@%p19 bra $L__BB0_14;
bra.uni $L__BB0_10;
$L__BB0_14:
- ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
+ ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_10:
@%p1 bra $L__BB0_12;
bra.uni $L__BB0_11;
$L__BB0_12:
- add.s32 %r260, %r13, %r422;
- mad.lo.s32 %r261, %r260, %r3, %r7;
+ add.s32 %r260, %r14, %r420;
+ mad.lo.s32 %r261, %r260, %r4, %r8;
setp.gt.s32 %p20, %r261, 63;
- mov.u32 %r423, 0;
- mov.u32 %r424, %r423;
- mov.u32 %r425, %r423;
- mov.u32 %r426, %r423;
+ mov.u32 %r421, 0;
+ mov.u32 %r422, %r421;
+ mov.u32 %r423, %r421;
+ mov.u32 %r424, %r421;
@%p20 bra $L__BB0_15;
- ld.shared.v4.u32 {%r423, %r424, %r425, %r426}, [%rd7];
+ ld.shared.v4.u32 {%r421, %r422, %r423, %r424}, [%rd7];
bra.uni $L__BB0_15;
$L__BB0_11:
- mov.u32 %r423, 0;
- mov.u32 %r424, %r423;
- mov.u32 %r425, %r423;
- mov.u32 %r426, %r423;
+ mov.u32 %r421, 0;
+ mov.u32 %r422, %r421;
+ mov.u32 %r423, %r421;
+ mov.u32 %r424, %r421;
$L__BB0_15:
- add.s32 %r270, %r13, %r422;
- mad.lo.s32 %r33, %r270, %r3, %r7;
- mov.b32 %f117, %r426;
+ add.s32 %r270, %r14, %r420;
+ mad.lo.s32 %r33, %r270, %r4, %r8;
+ mov.b32 %f117, %r424;
add.f32 %f385, %f385, %f117;
- mov.b32 %f118, %r425;
+ mov.b32 %f118, %r423;
add.f32 %f384, %f384, %f118;
- mov.b32 %f119, %r424;
+ mov.b32 %f119, %r422;
add.f32 %f383, %f383, %f119;
- mov.b32 %f120, %r423;
+ mov.b32 %f120, %r421;
add.f32 %f382, %f382, %f120;
setp.gt.s32 %p21, %r33, 63;
mov.f32 %f368, 0f00000000;
@%p21 bra $L__BB0_17;
@@ -284,11 +284,11 @@
setp.lt.s32 %p22, %r33, 64;
and.pred %p4, %p1, %p22;
not.pred %p23, %p4;
@%p23 bra $L__BB0_19;
- mul.lo.s32 %r275, %r12, %r422;
+ mul.lo.s32 %r275, %r13, %r420;
cvt.s64.s32 %rd75, %r275;
add.s64 %rd76, %rd5, %rd75;
add.s64 %rd77, %rd76, %rd6;
shl.b64 %rd78, %rd77, 2;
add.s64 %rd74, %rd32, %rd78;
@@ -301,13 +301,13 @@
}
$L__BB0_19:
- add.s32 %r421, %r13, %r422;
- mad.lo.s32 %r420, %r421, %r3, %r7;
- setp.gt.s32 %p145, %r420, 63;
+ add.s32 %r419, %r14, %r420;
+ mad.lo.s32 %r418, %r419, %r4, %r8;
+ setp.gt.s32 %p145, %r418, 63;
mov.f32 %f374, 0f00000000;
mov.f32 %f369, %f374;
@%p145 bra $L__BB0_21;
mul.lo.s32 %r276, %r33, %r173;
@@ -364,37 +364,37 @@
$L__BB0_25:
setp.lt.s32 %p27, %r16, 4;
bar.sync 0;
@%p27 bra $L__BB0_30;
- mov.u32 %r427, %r17;
+ mov.u32 %r425, %r17;
$L__BB0_27:
- setp.ge.u32 %p28, %r5, %r427;
+ setp.ge.u32 %p28, %r6, %r425;
@%p28 bra $L__BB0_29;
- add.s32 %r277, %r427, %r15;
+ add.s32 %r277, %r425, %r15;
mul.wide.s32 %rd81, %r277, 4;
add.s64 %rd83, %rd45, %rd81;
ld.shared.f32 %f169, [%rd8];
ld.shared.f32 %f170, [%rd83];
add.f32 %f171, %f170, %f169;
st.shared.f32 [%rd8], %f171;
$L__BB0_29:
bar.sync 0;
- shr.u32 %r35, %r427, 1;
- setp.gt.u32 %p29, %r427, 3;
- mov.u32 %r427, %r35;
+ shr.u32 %r35, %r425, 1;
+ setp.gt.u32 %p29, %r425, 3;
+ mov.u32 %r425, %r35;
@%p29 bra $L__BB0_27;
$L__BB0_30:
- setp.ne.s32 %p30, %r5, 0;
+ setp.ne.s32 %p30, %r6, 0;
mov.f32 %f376, 0f00000000;
@%p30 bra $L__BB0_33;
- setp.lt.u32 %p31, %r2, 2;
+ setp.lt.u32 %p31, %r3, 2;
ld.shared.f32 %f173, [%rd8];
add.f32 %f376, %f173, 0f00000000;
@%p31 bra $L__BB0_33;
ld.shared.f32 %f174, [%rd11];
@@ -414,36 +414,36 @@
$L__BB0_35:
setp.lt.s32 %p142, %r16, 4;
bar.sync 0;
@%p142 bra $L__BB0_40;
- mov.u32 %r428, %r17;
+ mov.u32 %r426, %r17;
$L__BB0_37:
- setp.ge.u32 %p34, %r5, %r428;
+ setp.ge.u32 %p34, %r6, %r426;
@%p34 bra $L__BB0_39;
- add.s32 %r278, %r428, %r15;
+ add.s32 %r278, %r426, %r15;
mul.wide.s32 %rd84, %r278, 4;
add.s64 %rd86, %rd45, %rd84;
ld.shared.f32 %f178, [%rd8];
ld.shared.f32 %f179, [%rd86];
add.f32 %f180, %f179, %f178;
st.shared.f32 [%rd8], %f180;
$L__BB0_39:
bar.sync 0;
- shr.u32 %r37, %r428, 1;
- setp.gt.u32 %p35, %r428, 3;
- mov.u32 %r428, %r37;
+ shr.u32 %r37, %r426, 1;
+ setp.gt.u32 %p35, %r426, 3;
+ mov.u32 %r426, %r37;
@%p35 bra $L__BB0_37;
$L__BB0_40:
mov.f32 %f377, 0f00000000;
@%p30 bra $L__BB0_43;
- setp.lt.u32 %p37, %r2, 2;
+ setp.lt.u32 %p37, %r3, 2;
ld.shared.f32 %f182, [%rd8];
add.f32 %f377, %f182, 0f00000000;
@%p37 bra $L__BB0_43;
ld.shared.f32 %f183, [%rd11];
@@ -507,21 +507,20 @@
sub.f32 %f237, %f233, %f37;
mul.f32 %f238, %f38, %f236;
sub.f32 %f239, %f237, %f238;
mul.f32 %f240, %f184, %f239;
mov.b32 %r282, %f240;
- mad.lo.s32 %r283, %r422, %r3, %r10;
- mad.lo.s32 %r284, %r283, %r176, %r14;
- mul.wide.s32 %rd88, %r284, 4;
+ mad.lo.s32 %r283, %r33, %r176, %r7;
+ mul.wide.s32 %rd88, %r283, 4;
add.s64 %rd87, %rd40, %rd88;
st.global.cs.v4.s32 [%rd87], {%r279,%r280,%r281,%r282};
$L__BB0_49:
- add.s32 %r422, %r422, 1;
- setp.lt.s32 %p41, %r422, %r9;
+ add.s32 %r420, %r420, 1;
+ setp.lt.s32 %p41, %r420, %r10;
@%p41 bra $L__BB0_5;
bra.uni $L__BB0_50;
$L__BB0_3:
mov.f32 %f370, 0f00000000;
@@ -532,31 +531,31 @@
mov.f32 %f383, %f370;
mov.f32 %f384, %f370;
mov.f32 %f385, %f370;
$L__BB0_50:
- mov.u32 %r285, %tid.z;
- mad.lo.s32 %r286, %r3, %r285, %r7;
- mad.lo.s32 %r39, %r286, %r2, %r5;
+ mov.u32 %r284, %tid.z;
+ mad.lo.s32 %r285, %r4, %r284, %r8;
+ mad.lo.s32 %r39, %r285, %r3, %r6;
mul.wide.u32 %rd89, %r39, 4;
add.s64 %rd23, %rd45, %rd89;
- clz.b32 %r287, %r3;
- mov.u32 %r288, 31;
- sub.s32 %r289, %r288, %r287;
- mov.u32 %r290, 1;
- shl.b32 %r40, %r290, %r289;
- setp.lt.u32 %p42, %r7, %r40;
- add.s32 %r291, %r40, %r7;
- setp.lt.u32 %p43, %r291, %r3;
+ clz.b32 %r286, %r4;
+ mov.u32 %r287, 31;
+ sub.s32 %r288, %r287, %r286;
+ mov.u32 %r289, 1;
+ shl.b32 %r40, %r289, %r288;
+ setp.lt.u32 %p42, %r8, %r40;
+ add.s32 %r290, %r40, %r8;
+ setp.lt.u32 %p43, %r290, %r4;
and.pred %p5, %p42, %p43;
- shl.b32 %r292, %r2, %r289;
- add.s32 %r293, %r39, %r292;
- mul.wide.s32 %rd91, %r293, 4;
+ shl.b32 %r291, %r3, %r288;
+ add.s32 %r292, %r39, %r291;
+ mul.wide.s32 %rd91, %r292, 4;
add.s64 %rd24, %rd45, %rd91;
- shr.u32 %r294, %r40, 31;
- add.s32 %r295, %r40, %r294;
- shr.s32 %r443, %r295, 1;
+ shr.u32 %r293, %r40, 31;
+ add.s32 %r294, %r40, %r293;
+ shr.s32 %r441, %r294, 1;
st.shared.f32 [%rd23], %f382;
bar.sync 0;
not.pred %p44, %p5;
@%p44 bra $L__BB0_52;
@@ -568,49 +567,49 @@
$L__BB0_52:
setp.lt.s32 %p45, %r40, 4;
bar.sync 0;
@%p45 bra $L__BB0_57;
- mov.u32 %r429, %r443;
+ mov.u32 %r427, %r441;
$L__BB0_54:
- setp.ge.u32 %p46, %r7, %r429;
+ setp.ge.u32 %p46, %r8, %r427;
@%p46 bra $L__BB0_56;
- mad.lo.s32 %r296, %r429, %r2, %r39;
- mul.wide.s32 %rd92, %r296, 4;
+ mad.lo.s32 %r295, %r427, %r3, %r39;
+ mul.wide.s32 %rd92, %r295, 4;
add.s64 %rd94, %rd45, %rd92;
ld.shared.f32 %f244, [%rd23];
ld.shared.f32 %f245, [%rd94];
add.f32 %f246, %f245, %f244;
st.shared.f32 [%rd23], %f246;
$L__BB0_56:
bar.sync 0;
- shr.u32 %r43, %r429, 1;
- setp.gt.u32 %p47, %r429, 3;
- mov.u32 %r429, %r43;
+ shr.u32 %r43, %r427, 1;
+ setp.gt.u32 %p47, %r427, 3;
+ mov.u32 %r427, %r43;
@%p47 bra $L__BB0_54;
$L__BB0_57:
- add.s32 %r298, %r39, %r2;
- mul.wide.u32 %rd95, %r298, 4;
+ add.s32 %r297, %r39, %r3;
+ mul.wide.u32 %rd95, %r297, 4;
add.s64 %rd25, %rd45, %rd95;
- setp.ne.s32 %p48, %r7, 0;
- mov.u32 %r430, 0;
+ setp.ne.s32 %p48, %r8, 0;
+ mov.u32 %r428, 0;
@%p48 bra $L__BB0_61;
- setp.lt.u32 %p49, %r3, 2;
+ setp.lt.u32 %p49, %r4, 2;
ld.shared.f32 %f247, [%rd23];
add.f32 %f386, %f247, 0f00000000;
@%p49 bra $L__BB0_60;
ld.shared.f32 %f248, [%rd25];
add.f32 %f386, %f386, %f248;
$L__BB0_60:
- mov.b32 %r430, %f386;
+ mov.b32 %r428, %f386;
$L__BB0_61:
bar.sync 0;
st.shared.f32 [%rd23], %f383;
bar.sync 0;
@@ -623,45 +622,45 @@
$L__BB0_63:
bar.sync 0;
@%p45 bra $L__BB0_68;
- mov.u32 %r431, %r443;
+ mov.u32 %r429, %r441;
$L__BB0_65:
- setp.ge.u32 %p52, %r7, %r431;
+ setp.ge.u32 %p52, %r8, %r429;
@%p52 bra $L__BB0_67;
- mad.lo.s32 %r299, %r431, %r2, %r39;
- mul.wide.s32 %rd97, %r299, 4;
+ mad.lo.s32 %r298, %r429, %r3, %r39;
+ mul.wide.s32 %rd97, %r298, 4;
add.s64 %rd99, %rd45, %rd97;
ld.shared.f32 %f252, [%rd23];
ld.shared.f32 %f253, [%rd99];
add.f32 %f254, %f253, %f252;
st.shared.f32 [%rd23], %f254;
$L__BB0_67:
bar.sync 0;
- shr.u32 %r47, %r431, 1;
- setp.gt.u32 %p53, %r431, 3;
- mov.u32 %r431, %r47;
+ shr.u32 %r47, %r429, 1;
+ setp.gt.u32 %p53, %r429, 3;
+ mov.u32 %r429, %r47;
@%p53 bra $L__BB0_65;
$L__BB0_68:
- mov.u32 %r432, 0;
+ mov.u32 %r430, 0;
@%p48 bra $L__BB0_72;
- setp.lt.u32 %p55, %r3, 2;
+ setp.lt.u32 %p55, %r4, 2;
ld.shared.f32 %f255, [%rd23];
add.f32 %f387, %f255, 0f00000000;
@%p55 bra $L__BB0_71;
ld.shared.f32 %f256, [%rd25];
add.f32 %f387, %f387, %f256;
$L__BB0_71:
- mov.b32 %r432, %f387;
+ mov.b32 %r430, %f387;
$L__BB0_72:
bar.sync 0;
st.shared.f32 [%rd23], %f384;
bar.sync 0;
@@ -674,45 +673,45 @@
$L__BB0_74:
bar.sync 0;
@%p45 bra $L__BB0_79;
- mov.u32 %r433, %r443;
+ mov.u32 %r431, %r441;
$L__BB0_76:
- setp.ge.u32 %p58, %r7, %r433;
+ setp.ge.u32 %p58, %r8, %r431;
@%p58 bra $L__BB0_78;
- mad.lo.s32 %r301, %r433, %r2, %r39;
- mul.wide.s32 %rd100, %r301, 4;
+ mad.lo.s32 %r300, %r431, %r3, %r39;
+ mul.wide.s32 %rd100, %r300, 4;
add.s64 %rd102, %rd45, %rd100;
ld.shared.f32 %f260, [%rd23];
ld.shared.f32 %f261, [%rd102];
add.f32 %f262, %f261, %f260;
st.shared.f32 [%rd23], %f262;
$L__BB0_78:
bar.sync 0;
- shr.u32 %r51, %r433, 1;
- setp.gt.u32 %p59, %r433, 3;
- mov.u32 %r433, %r51;
+ shr.u32 %r51, %r431, 1;
+ setp.gt.u32 %p59, %r431, 3;
+ mov.u32 %r431, %r51;
@%p59 bra $L__BB0_76;
$L__BB0_79:
- mov.u32 %r434, 0;
+ mov.u32 %r432, 0;
@%p48 bra $L__BB0_83;
- setp.lt.u32 %p61, %r3, 2;
+ setp.lt.u32 %p61, %r4, 2;
ld.shared.f32 %f263, [%rd23];
add.f32 %f388, %f263, 0f00000000;
@%p61 bra $L__BB0_82;
ld.shared.f32 %f264, [%rd25];
add.f32 %f388, %f388, %f264;
$L__BB0_82:
- mov.b32 %r434, %f388;
+ mov.b32 %r432, %f388;
$L__BB0_83:
bar.sync 0;
st.shared.f32 [%rd23], %f385;
bar.sync 0;
@@ -725,45 +724,45 @@
$L__BB0_85:
bar.sync 0;
@%p45 bra $L__BB0_90;
- mov.u32 %r435, %r443;
+ mov.u32 %r433, %r441;
$L__BB0_87:
- setp.ge.u32 %p64, %r7, %r435;
+ setp.ge.u32 %p64, %r8, %r433;
@%p64 bra $L__BB0_89;
- mad.lo.s32 %r303, %r435, %r2, %r39;
- mul.wide.s32 %rd103, %r303, 4;
+ mad.lo.s32 %r302, %r433, %r3, %r39;
+ mul.wide.s32 %rd103, %r302, 4;
add.s64 %rd105, %rd45, %rd103;
ld.shared.f32 %f268, [%rd23];
ld.shared.f32 %f269, [%rd105];
add.f32 %f270, %f269, %f268;
st.shared.f32 [%rd23], %f270;
$L__BB0_89:
bar.sync 0;
- shr.u32 %r55, %r435, 1;
- setp.gt.u32 %p65, %r435, 3;
- mov.u32 %r435, %r55;
+ shr.u32 %r55, %r433, 1;
+ setp.gt.u32 %p65, %r433, 3;
+ mov.u32 %r433, %r55;
@%p65 bra $L__BB0_87;
$L__BB0_90:
- mov.u32 %r436, 0;
+ mov.u32 %r434, 0;
@%p48 bra $L__BB0_94;
- setp.lt.u32 %p67, %r3, 2;
+ setp.lt.u32 %p67, %r4, 2;
ld.shared.f32 %f271, [%rd23];
add.f32 %f389, %f271, 0f00000000;
@%p67 bra $L__BB0_93;
ld.shared.f32 %f272, [%rd25];
add.f32 %f389, %f389, %f272;
$L__BB0_93:
- mov.b32 %r436, %f389;
+ mov.b32 %r434, %f389;
$L__BB0_94:
bar.sync 0;
st.shared.f32 [%rd23], %f370;
bar.sync 0;
@@ -776,45 +775,45 @@
$L__BB0_96:
bar.sync 0;
@%p45 bra $L__BB0_101;
- mov.u32 %r437, %r443;
+ mov.u32 %r435, %r441;
$L__BB0_98:
- setp.ge.u32 %p70, %r7, %r437;
+ setp.ge.u32 %p70, %r8, %r435;
@%p70 bra $L__BB0_100;
- mad.lo.s32 %r305, %r437, %r2, %r39;
- mul.wide.s32 %rd106, %r305, 4;
+ mad.lo.s32 %r304, %r435, %r3, %r39;
+ mul.wide.s32 %rd106, %r304, 4;
add.s64 %rd108, %rd45, %rd106;
ld.shared.f32 %f276, [%rd23];
ld.shared.f32 %f277, [%rd108];
add.f32 %f278, %f277, %f276;
st.shared.f32 [%rd23], %f278;
$L__BB0_100:
bar.sync 0;
- shr.u32 %r59, %r437, 1;
- setp.gt.u32 %p71, %r437, 3;
- mov.u32 %r437, %r59;
+ shr.u32 %r59, %r435, 1;
+ setp.gt.u32 %p71, %r435, 3;
+ mov.u32 %r435, %r59;
@%p71 bra $L__BB0_98;
$L__BB0_101:
- mov.u32 %r438, 0;
+ mov.u32 %r436, 0;
@%p48 bra $L__BB0_105;
- setp.lt.u32 %p73, %r3, 2;
+ setp.lt.u32 %p73, %r4, 2;
ld.shared.f32 %f279, [%rd23];
add.f32 %f390, %f279, 0f00000000;
@%p73 bra $L__BB0_104;
ld.shared.f32 %f280, [%rd25];
add.f32 %f390, %f390, %f280;
$L__BB0_104:
- mov.b32 %r438, %f390;
+ mov.b32 %r436, %f390;
$L__BB0_105:
bar.sync 0;
st.shared.f32 [%rd23], %f371;
bar.sync 0;
@@ -827,45 +826,45 @@
$L__BB0_107:
bar.sync 0;
@%p45 bra $L__BB0_112;
- mov.u32 %r439, %r443;
+ mov.u32 %r437, %r441;
$L__BB0_109:
- setp.ge.u32 %p76, %r7, %r439;
+ setp.ge.u32 %p76, %r8, %r437;
@%p76 bra $L__BB0_111;
- mad.lo.s32 %r307, %r439, %r2, %r39;
- mul.wide.s32 %rd109, %r307, 4;
+ mad.lo.s32 %r306, %r437, %r3, %r39;
+ mul.wide.s32 %rd109, %r306, 4;
add.s64 %rd111, %rd45, %rd109;
ld.shared.f32 %f284, [%rd23];
ld.shared.f32 %f285, [%rd111];
add.f32 %f286, %f285, %f284;
st.shared.f32 [%rd23], %f286;
$L__BB0_111:
bar.sync 0;
- shr.u32 %r63, %r439, 1;
- setp.gt.u32 %p77, %r439, 3;
- mov.u32 %r439, %r63;
+ shr.u32 %r63, %r437, 1;
+ setp.gt.u32 %p77, %r437, 3;
+ mov.u32 %r437, %r63;
@%p77 bra $L__BB0_109;
$L__BB0_112:
- mov.u32 %r440, 0;
+ mov.u32 %r438, 0;
@%p48 bra $L__BB0_116;
- setp.lt.u32 %p79, %r3, 2;
+ setp.lt.u32 %p79, %r4, 2;
ld.shared.f32 %f287, [%rd23];
add.f32 %f391, %f287, 0f00000000;
@%p79 bra $L__BB0_115;
ld.shared.f32 %f288, [%rd25];
add.f32 %f391, %f391, %f288;
$L__BB0_115:
- mov.b32 %r440, %f391;
+ mov.b32 %r438, %f391;
$L__BB0_116:
bar.sync 0;
st.shared.f32 [%rd23], %f372;
bar.sync 0;
@@ -878,45 +877,45 @@
$L__BB0_118:
bar.sync 0;
@%p45 bra $L__BB0_123;
- mov.u32 %r441, %r443;
+ mov.u32 %r439, %r441;
$L__BB0_120:
- setp.ge.u32 %p82, %r7, %r441;
+ setp.ge.u32 %p82, %r8, %r439;
@%p82 bra $L__BB0_122;
- mad.lo.s32 %r309, %r441, %r2, %r39;
- mul.wide.s32 %rd112, %r309, 4;
+ mad.lo.s32 %r308, %r439, %r3, %r39;
+ mul.wide.s32 %rd112, %r308, 4;
add.s64 %rd114, %rd45, %rd112;
ld.shared.f32 %f292, [%rd23];
ld.shared.f32 %f293, [%rd114];
add.f32 %f294, %f293, %f292;
st.shared.f32 [%rd23], %f294;
$L__BB0_122:
bar.sync 0;
- shr.u32 %r67, %r441, 1;
- setp.gt.u32 %p83, %r441, 3;
- mov.u32 %r441, %r67;
+ shr.u32 %r67, %r439, 1;
+ setp.gt.u32 %p83, %r439, 3;
+ mov.u32 %r439, %r67;
@%p83 bra $L__BB0_120;
$L__BB0_123:
- mov.u32 %r442, 0;
+ mov.u32 %r440, 0;
@%p48 bra $L__BB0_127;
- setp.lt.u32 %p85, %r3, 2;
+ setp.lt.u32 %p85, %r4, 2;
ld.shared.f32 %f295, [%rd23];
add.f32 %f392, %f295, 0f00000000;
@%p85 bra $L__BB0_126;
ld.shared.f32 %f296, [%rd25];
add.f32 %f392, %f392, %f296;
$L__BB0_126:
- mov.b32 %r442, %f392;
+ mov.b32 %r440, %f392;
$L__BB0_127:
bar.sync 0;
st.shared.f32 [%rd23], %f373;
bar.sync 0;
@@ -930,185 +929,184 @@
$L__BB0_129:
bar.sync 0;
@%p45 bra $L__BB0_133;
$L__BB0_130:
- setp.ge.u32 %p88, %r7, %r443;
+ setp.ge.u32 %p88, %r8, %r441;
@%p88 bra $L__BB0_132;
- mad.lo.s32 %r311, %r443, %r2, %r39;
- mul.wide.s32 %rd115, %r311, 4;
+ mad.lo.s32 %r310, %r441, %r3, %r39;
+ mul.wide.s32 %rd115, %r310, 4;
add.s64 %rd117, %rd45, %rd115;
ld.shared.f32 %f300, [%rd23];
ld.shared.f32 %f301, [%rd117];
add.f32 %f302, %f301, %f300;
st.shared.f32 [%rd23], %f302;
$L__BB0_132:
bar.sync 0;
- shr.u32 %r71, %r443, 1;
- setp.gt.u32 %p89, %r443, 3;
- mov.u32 %r443, %r71;
+ shr.u32 %r71, %r441, 1;
+ setp.gt.u32 %p89, %r441, 3;
+ mov.u32 %r441, %r71;
@%p89 bra $L__BB0_130;
$L__BB0_133:
- mov.u32 %r444, 0;
+ mov.u32 %r442, 0;
@%p48 bra $L__BB0_137;
- setp.lt.u32 %p91, %r3, 2;
+ setp.lt.u32 %p91, %r4, 2;
ld.shared.f32 %f303, [%rd23];
add.f32 %f393, %f303, 0f00000000;
@%p91 bra $L__BB0_136;
ld.shared.f32 %f304, [%rd25];
add.f32 %f393, %f393, %f304;
$L__BB0_136:
- mov.b32 %r444, %f393;
+ mov.b32 %r442, %f393;
$L__BB0_137:
- setp.eq.s32 %p144, %r7, 0;
+ setp.eq.s32 %p144, %r8, 0;
and.pred %p143, %p144, %p1;
bar.sync 0;
@%p143 bra $L__BB0_138;
bra.uni $L__BB0_139;
$L__BB0_138:
- shl.b32 %r419, %r5, 2;
- mov.u32 %r321, %ctaid.y;
- mad.lo.s32 %r322, %r176, %r321, %r419;
- mul.wide.s32 %rd120, %r322, 4;
+ mov.u32 %r320, %ctaid.y;
+ mad.lo.s32 %r321, %r176, %r320, %r7;
+ mul.wide.s32 %rd120, %r321, 4;
add.s64 %rd118, %rd42, %rd120;
- st.volatile.global.v4.s32 [%rd118], {%r430,%r432,%r434,%r436};
+ st.volatile.global.v4.s32 [%rd118], {%r428,%r430,%r432,%r434};
add.s64 %rd119, %rd43, %rd120;
- st.volatile.global.v4.s32 [%rd119], {%r438,%r440,%r442,%r444};
+ st.volatile.global.v4.s32 [%rd119], {%r436,%r438,%r440,%r442};
$L__BB0_139:
mov.u32 %r74, %ctaid.y;
membar.gl;
bar.sync 0;
- or.b32 %r323, %r5, %r7;
- or.b32 %r325, %r323, %r285;
- setp.ne.s32 %p92, %r325, 0;
+ or.b32 %r322, %r6, %r8;
+ or.b32 %r324, %r322, %r284;
+ setp.ne.s32 %p92, %r324, 0;
@%p92 bra $L__BB0_143;
ld.param.u64 %rd161, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_12];
cvta.to.global.u64 %rd121, %rd161;
- mov.u32 %r326, %ctaid.x;
- mov.u32 %r327, %ctaid.z;
- mov.u32 %r328, %nctaid.x;
- mad.lo.s32 %r329, %r327, %r328, %r326;
- mul.wide.s32 %rd122, %r329, 8;
+ mov.u32 %r325, %ctaid.x;
+ mov.u32 %r326, %ctaid.z;
+ mov.u32 %r327, %nctaid.x;
+ mad.lo.s32 %r328, %r326, %r327, %r325;
+ mul.wide.s32 %rd122, %r328, 8;
add.s64 %rd28, %rd121, %rd122;
- add.s32 %r330, %r8, -1;
- setp.eq.s32 %p93, %r74, %r330;
- cvt.s64.s32 %rd123, %r8;
+ add.s32 %r329, %r9, -1;
+ setp.eq.s32 %p93, %r74, %r329;
+ cvt.s64.s32 %rd123, %r9;
mov.u64 %rd124, -9223372036854775807;
sub.s64 %rd125, %rd124, %rd123;
selp.b64 %rd126, %rd125, 1, %p93;
atom.global.add.u64 %rd29, [%rd28], %rd126;
ld.volatile.global.u64 %rd127, [%rd28];
xor.b64 %rd128, %rd127, %rd29;
setp.lt.s64 %p94, %rd128, 0;
@%p94 bra $L__BB0_143;
- mov.u32 %r445, 8;
+ mov.u32 %r443, 8;
$L__BB0_142:
- nanosleep.u32 %r445;
-
- setp.lt.u32 %p95, %r445, 256;
- selp.u32 %r333, 1, 0, %p95;
- shl.b32 %r445, %r445, %r333;
+ nanosleep.u32 %r443;
+
+ setp.lt.u32 %p95, %r443, 256;
+ selp.u32 %r332, 1, 0, %p95;
+ shl.b32 %r443, %r443, %r332;
ld.volatile.global.u64 %rd129, [%rd28];
xor.b64 %rd130, %rd129, %rd29;
setp.gt.s64 %p96, %rd130, -1;
@%p96 bra $L__BB0_142;
$L__BB0_143:
bar.sync 0;
- add.s32 %r334, %r8, %r2;
- add.s32 %r335, %r334, -1;
- div.s32 %r77, %r335, %r2;
+ add.s32 %r333, %r9, %r3;
+ add.s32 %r334, %r333, -1;
+ div.s32 %r77, %r334, %r3;
setp.lt.s32 %p97, %r77, 1;
mov.f32 %f396, 0f00000000;
mov.f32 %f397, %f396;
@%p97 bra $L__BB0_149;
- add.s32 %r337, %r176, 1;
- shr.u32 %r338, %r337, 31;
- add.s32 %r339, %r337, %r338;
- shr.s32 %r340, %r339, 1;
- add.s32 %r341, %r3, %r340;
- add.s32 %r342, %r341, -1;
- shl.b32 %r343, %r7, 1;
- shl.b32 %r344, %r3, 1;
- mad.lo.s32 %r345, %r344, %r74, %r343;
- or.b32 %r346, %r345, 1;
- setp.ge.s32 %p98, %r346, %r176;
- div.s32 %r347, %r342, %r3;
- setp.ge.s32 %p99, %r74, %r347;
+ add.s32 %r336, %r176, 1;
+ shr.u32 %r337, %r336, 31;
+ add.s32 %r338, %r336, %r337;
+ shr.s32 %r339, %r338, 1;
+ add.s32 %r340, %r4, %r339;
+ add.s32 %r341, %r340, -1;
+ shl.b32 %r342, %r8, 1;
+ shl.b32 %r343, %r4, 1;
+ mad.lo.s32 %r344, %r343, %r74, %r342;
+ or.b32 %r345, %r344, 1;
+ setp.ge.s32 %p98, %r345, %r176;
+ div.s32 %r346, %r341, %r4;
+ setp.ge.s32 %p99, %r74, %r346;
or.pred %p6, %p99, %p98;
- mul.lo.s32 %r348, %r3, %r74;
- shl.b32 %r349, %r348, 1;
- mad.lo.s32 %r350, %r176, %r5, %r349;
- add.s32 %r447, %r350, %r343;
- mul.lo.s32 %r79, %r176, %r2;
- mov.u32 %r336, 0;
+ mul.lo.s32 %r347, %r4, %r74;
+ shl.b32 %r348, %r347, 1;
+ mad.lo.s32 %r349, %r176, %r6, %r348;
+ add.s32 %r445, %r349, %r342;
+ mul.lo.s32 %r79, %r176, %r3;
+ mov.u32 %r335, 0;
mov.f32 %f396, 0f00000000;
- mov.u32 %r446, %r5;
- mov.u32 %r448, %r336;
+ mov.u32 %r444, %r6;
+ mov.u32 %r446, %r335;
$L__BB0_145:
.pragma "nounroll";
- mov.u32 %r449, %r336;
- mov.u32 %r450, %r336;
+ mov.u32 %r447, %r335;
+ mov.u32 %r448, %r335;
@%p6 bra $L__BB0_148;
- setp.ge.s32 %p100, %r446, %r8;
- mov.u32 %r449, %r336;
- mov.u32 %r450, %r336;
+ setp.ge.s32 %p100, %r444, %r9;
+ mov.u32 %r447, %r335;
+ mov.u32 %r448, %r335;
@%p100 bra $L__BB0_148;
- mul.wide.s32 %rd132, %r447, 4;
+ mul.wide.s32 %rd132, %r445, 4;
add.s64 %rd131, %rd43, %rd132;
- ld.volatile.global.v2.s32 {%r450,%r449}, [%rd131];
+ ld.volatile.global.v2.s32 {%r448,%r447}, [%rd131];
$L__BB0_148:
- mov.b32 %f309, %r450;
+ mov.b32 %f309, %r448;
add.f32 %f396, %f396, %f309;
- mov.b32 %f310, %r449;
+ mov.b32 %f310, %r447;
add.f32 %f397, %f397, %f310;
- add.s32 %r447, %r447, %r79;
- add.s32 %r446, %r446, %r2;
- add.s32 %r448, %r448, 1;
- setp.lt.s32 %p101, %r448, %r77;
+ add.s32 %r445, %r445, %r79;
+ add.s32 %r444, %r444, %r3;
+ add.s32 %r446, %r446, 1;
+ setp.lt.s32 %p101, %r446, %r77;
@%p101 bra $L__BB0_145;
$L__BB0_149:
- clz.b32 %r357, %r2;
- mov.u32 %r358, 31;
- sub.s32 %r359, %r358, %r357;
- mov.u32 %r360, 1;
- shl.b32 %r90, %r360, %r359;
- setp.lt.u32 %p102, %r5, %r90;
- add.s32 %r361, %r90, %r5;
- setp.lt.u32 %p103, %r361, %r2;
+ clz.b32 %r356, %r3;
+ mov.u32 %r357, 31;
+ sub.s32 %r358, %r357, %r356;
+ mov.u32 %r359, 1;
+ shl.b32 %r90, %r359, %r358;
+ setp.lt.u32 %p102, %r6, %r90;
+ add.s32 %r360, %r90, %r6;
+ setp.lt.u32 %p103, %r360, %r3;
and.pred %p7, %p102, %p103;
- add.s32 %r362, %r39, %r90;
- mul.wide.s32 %rd133, %r362, 4;
+ add.s32 %r361, %r39, %r90;
+ mul.wide.s32 %rd133, %r361, 4;
add.s64 %rd30, %rd45, %rd133;
- shr.u32 %r363, %r90, 31;
- add.s32 %r364, %r90, %r363;
- shr.s32 %r461, %r364, 1;
+ shr.u32 %r362, %r90, 31;
+ add.s32 %r363, %r90, %r362;
+ shr.s32 %r459, %r363, 1;
st.shared.f32 [%rd23], %f396;
bar.sync 0;
not.pred %p104, %p7;
@%p104 bra $L__BB0_151;
@@ -1120,49 +1118,49 @@
$L__BB0_151:
setp.lt.s32 %p105, %r90, 4;
bar.sync 0;
@%p105 bra $L__BB0_156;
- mov.u32 %r451, %r461;
+ mov.u32 %r449, %r459;
$L__BB0_153:
- setp.ge.u32 %p106, %r5, %r451;
+ setp.ge.u32 %p106, %r6, %r449;
@%p106 bra $L__BB0_155;
- add.s32 %r365, %r451, %r39;
- mul.wide.s32 %rd135, %r365, 4;
+ add.s32 %r364, %r449, %r39;
+ mul.wide.s32 %rd135, %r364, 4;
add.s64 %rd137, %rd45, %rd135;
ld.shared.f32 %f314, [%rd23];
ld.shared.f32 %f315, [%rd137];
add.f32 %f316, %f315, %f314;
st.shared.f32 [%rd23], %f316;
$L__BB0_155:
bar.sync 0;
- shr.u32 %r93, %r451, 1;
- setp.gt.u32 %p107, %r451, 3;
- mov.u32 %r451, %r93;
+ shr.u32 %r93, %r449, 1;
+ setp.gt.u32 %p107, %r449, 3;
+ mov.u32 %r449, %r93;
@%p107 bra $L__BB0_153;
$L__BB0_156:
- add.s32 %r367, %r39, 1;
- mul.wide.u32 %rd138, %r367, 4;
+ add.s32 %r366, %r39, 1;
+ mul.wide.u32 %rd138, %r366, 4;
add.s64 %rd31, %rd45, %rd138;
- setp.ne.s32 %p108, %r5, 0;
- mov.u32 %r452, 0;
+ setp.ne.s32 %p108, %r6, 0;
+ mov.u32 %r450, 0;
@%p108 bra $L__BB0_160;
- setp.lt.u32 %p109, %r2, 2;
+ setp.lt.u32 %p109, %r3, 2;
ld.shared.f32 %f317, [%rd23];
add.f32 %f398, %f317, 0f00000000;
@%p109 bra $L__BB0_159;
ld.shared.f32 %f318, [%rd31];
add.f32 %f398, %f398, %f318;
$L__BB0_159:
- mov.b32 %r452, %f398;
+ mov.b32 %r450, %f398;
$L__BB0_160:
bar.sync 0;
st.shared.f32 [%rd23], %f397;
bar.sync 0;
@@ -1175,158 +1173,158 @@
$L__BB0_162:
bar.sync 0;
@%p105 bra $L__BB0_167;
- mov.u32 %r453, %r461;
+ mov.u32 %r451, %r459;
$L__BB0_164:
- setp.ge.u32 %p112, %r5, %r453;
+ setp.ge.u32 %p112, %r6, %r451;
@%p112 bra $L__BB0_166;
- add.s32 %r368, %r453, %r39;
- mul.wide.s32 %rd140, %r368, 4;
+ add.s32 %r367, %r451, %r39;
+ mul.wide.s32 %rd140, %r367, 4;
add.s64 %rd142, %rd45, %rd140;
ld.shared.f32 %f322, [%rd23];
ld.shared.f32 %f323, [%rd142];
add.f32 %f324, %f323, %f322;
st.shared.f32 [%rd23], %f324;
$L__BB0_166:
bar.sync 0;
- shr.u32 %r97, %r453, 1;
- setp.gt.u32 %p113, %r453, 3;
- mov.u32 %r453, %r97;
+ shr.u32 %r97, %r451, 1;
+ setp.gt.u32 %p113, %r451, 3;
+ mov.u32 %r451, %r97;
@%p113 bra $L__BB0_164;
$L__BB0_167:
- mov.u32 %r454, 0;
+ mov.u32 %r452, 0;
@%p108 bra $L__BB0_171;
- setp.lt.u32 %p115, %r2, 2;
+ setp.lt.u32 %p115, %r3, 2;
ld.shared.f32 %f325, [%rd23];
add.f32 %f399, %f325, 0f00000000;
@%p115 bra $L__BB0_170;
ld.shared.f32 %f326, [%rd31];
add.f32 %f399, %f399, %f326;
$L__BB0_170:
- mov.b32 %r454, %f399;
+ mov.b32 %r452, %f399;
$L__BB0_171:
bar.sync 0;
- setp.eq.s32 %p116, %r5, 0;
+ setp.eq.s32 %p116, %r6, 0;
@%p116 bra $L__BB0_172;
bra.uni $L__BB0_175;
$L__BB0_172:
- add.s32 %r370, %r176, 1;
- shr.u32 %r371, %r370, 31;
- add.s32 %r372, %r370, %r371;
- shr.s32 %r373, %r372, 1;
- add.s32 %r374, %r3, %r373;
- add.s32 %r375, %r374, -1;
- div.s32 %r376, %r375, %r3;
- setp.ge.s32 %p117, %r74, %r376;
+ add.s32 %r369, %r176, 1;
+ shr.u32 %r370, %r369, 31;
+ add.s32 %r371, %r369, %r370;
+ shr.s32 %r372, %r371, 1;
+ add.s32 %r373, %r4, %r372;
+ add.s32 %r374, %r373, -1;
+ div.s32 %r375, %r374, %r4;
+ setp.ge.s32 %p117, %r74, %r375;
@%p117 bra $L__BB0_175;
- shl.b32 %r100, %r7, 1;
- mul.lo.s32 %r377, %r3, %r74;
- shl.b32 %r101, %r377, 1;
- add.s32 %r378, %r100, %r101;
- or.b32 %r379, %r378, 1;
- setp.ge.s32 %p118, %r379, %r176;
+ shl.b32 %r100, %r8, 1;
+ mul.lo.s32 %r376, %r4, %r74;
+ shl.b32 %r101, %r376, 1;
+ add.s32 %r377, %r100, %r101;
+ or.b32 %r378, %r377, 1;
+ setp.ge.s32 %p118, %r378, %r176;
@%p118 bra $L__BB0_175;
ld.param.u64 %rd160, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_7];
- add.s32 %r382, %r101, %r100;
- mul.wide.s32 %rd144, %r382, 4;
+ add.s32 %r381, %r101, %r100;
+ mul.wide.s32 %rd144, %r381, 4;
add.s64 %rd143, %rd160, %rd144;
- st.global.cs.v2.s32 [%rd143], {%r452,%r454};
+ st.global.cs.v2.s32 [%rd143], {%r450,%r452};
$L__BB0_175:
- add.s32 %r383, %r176, 1;
- shr.u32 %r384, %r383, 31;
- add.s32 %r385, %r383, %r384;
- shr.s32 %r386, %r385, 1;
- add.s32 %r387, %r3, %r386;
- add.s32 %r388, %r387, -1;
- div.s32 %r102, %r388, %r3;
+ add.s32 %r382, %r176, 1;
+ shr.u32 %r383, %r382, 31;
+ add.s32 %r384, %r382, %r383;
+ shr.s32 %r385, %r384, 1;
+ add.s32 %r386, %r4, %r385;
+ add.s32 %r387, %r386, -1;
+ div.s32 %r102, %r387, %r4;
setp.ge.s32 %p119, %r74, %r102;
mov.f32 %f400, 0f00000000;
mov.f32 %f404, 0f00000000;
mov.f32 %f401, %f404;
@%p119 bra $L__BB0_178;
- shl.b32 %r103, %r7, 1;
- mul.lo.s32 %r389, %r3, %r74;
- shl.b32 %r104, %r389, 1;
- add.s32 %r390, %r103, %r104;
- or.b32 %r391, %r390, 1;
- setp.ge.s32 %p120, %r391, %r176;
+ shl.b32 %r103, %r8, 1;
+ mul.lo.s32 %r388, %r4, %r74;
+ shl.b32 %r104, %r388, 1;
+ add.s32 %r389, %r103, %r104;
+ or.b32 %r390, %r389, 1;
+ setp.ge.s32 %p120, %r390, %r176;
@%p120 bra $L__BB0_178;
ld.param.u64 %rd159, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_5];
- add.s32 %r394, %r104, %r103;
- mul.wide.s32 %rd146, %r394, 4;
+ add.s32 %r393, %r104, %r103;
+ mul.wide.s32 %rd146, %r393, 4;
add.s64 %rd145, %rd159, %rd146;
- ld.global.cs.v2.u32 {%r392,%r393}, [%rd145];
-
- mov.b32 %f400, %r392;
- mov.b32 %f401, %r393;
+ ld.global.cs.v2.u32 {%r391,%r392}, [%rd145];
+
+ mov.b32 %f400, %r391;
+ mov.b32 %f401, %r392;
$L__BB0_178:
mov.f32 %f405, %f404;
@%p97 bra $L__BB0_184;
- shl.b32 %r396, %r7, 1;
- shl.b32 %r397, %r3, 1;
- mad.lo.s32 %r398, %r397, %r74, %r396;
- or.b32 %r399, %r398, 1;
- setp.ge.s32 %p122, %r399, %r176;
+ shl.b32 %r395, %r8, 1;
+ shl.b32 %r396, %r4, 1;
+ mad.lo.s32 %r397, %r396, %r74, %r395;
+ or.b32 %r398, %r397, 1;
+ setp.ge.s32 %p122, %r398, %r176;
or.pred %p8, %p122, %p119;
- mul.lo.s32 %r400, %r3, %r74;
- shl.b32 %r401, %r400, 1;
- mad.lo.s32 %r402, %r176, %r5, %r401;
- add.s32 %r456, %r402, %r396;
- mul.lo.s32 %r106, %r176, %r2;
- mov.u32 %r395, 0;
+ mul.lo.s32 %r399, %r4, %r74;
+ shl.b32 %r400, %r399, 1;
+ mad.lo.s32 %r401, %r176, %r6, %r400;
+ add.s32 %r454, %r401, %r395;
+ mul.lo.s32 %r106, %r176, %r3;
+ mov.u32 %r394, 0;
mov.f32 %f404, 0f00000000;
- mov.u32 %r455, %r5;
+ mov.u32 %r453, %r6;
mov.f32 %f405, %f404;
- mov.u32 %r457, %r395;
+ mov.u32 %r455, %r394;
$L__BB0_180:
.pragma "nounroll";
- mov.u32 %r458, %r395;
- mov.u32 %r459, %r395;
+ mov.u32 %r456, %r394;
+ mov.u32 %r457, %r394;
@%p8 bra $L__BB0_183;
- setp.ge.s32 %p124, %r455, %r8;
- mov.u32 %r458, %r395;
- mov.u32 %r459, %r395;
+ setp.ge.s32 %p124, %r453, %r9;
+ mov.u32 %r456, %r394;
+ mov.u32 %r457, %r394;
@%p124 bra $L__BB0_183;
- mul.wide.s32 %rd148, %r456, 4;
+ mul.wide.s32 %rd148, %r454, 4;
add.s64 %rd147, %rd42, %rd148;
- ld.volatile.global.v2.s32 {%r459,%r458}, [%rd147];
+ ld.volatile.global.v2.s32 {%r457,%r456}, [%rd147];
$L__BB0_183:
- mov.b32 %f335, %r459;
+ mov.b32 %f335, %r457;
add.f32 %f404, %f404, %f335;
- mov.b32 %f336, %r458;
+ mov.b32 %f336, %r456;
add.f32 %f405, %f405, %f336;
- add.s32 %r456, %r456, %r106;
- add.s32 %r455, %r455, %r2;
- add.s32 %r457, %r457, 1;
- setp.lt.s32 %p125, %r457, %r77;
+ add.s32 %r454, %r454, %r106;
+ add.s32 %r453, %r453, %r3;
+ add.s32 %r455, %r455, 1;
+ setp.lt.s32 %p125, %r455, %r77;
@%p125 bra $L__BB0_180;
$L__BB0_184:
st.shared.f32 [%rd23], %f404;
bar.sync 0;
@@ -1339,36 +1337,36 @@
$L__BB0_186:
bar.sync 0;
@%p105 bra $L__BB0_191;
- mov.u32 %r460, %r461;
+ mov.u32 %r458, %r459;
$L__BB0_188:
- setp.ge.u32 %p128, %r5, %r460;
+ setp.ge.u32 %p128, %r6, %r458;
@%p128 bra $L__BB0_190;
- add.s32 %r409, %r460, %r39;
- mul.wide.s32 %rd149, %r409, 4;
+ add.s32 %r408, %r458, %r39;
+ mul.wide.s32 %rd149, %r408, 4;
add.s64 %rd151, %rd45, %rd149;
ld.shared.f32 %f340, [%rd23];
ld.shared.f32 %f341, [%rd151];
add.f32 %f342, %f341, %f340;
st.shared.f32 [%rd23], %f342;
$L__BB0_190:
bar.sync 0;
- shr.u32 %r118, %r460, 1;
- setp.gt.u32 %p129, %r460, 3;
- mov.u32 %r460, %r118;
+ shr.u32 %r118, %r458, 1;
+ setp.gt.u32 %p129, %r458, 3;
+ mov.u32 %r458, %r118;
@%p129 bra $L__BB0_188;
$L__BB0_191:
mov.f32 %f406, 0f00000000;
@%p108 bra $L__BB0_194;
- setp.lt.u32 %p131, %r2, 2;
+ setp.lt.u32 %p131, %r3, 2;
ld.shared.f32 %f344, [%rd23];
add.f32 %f406, %f344, 0f00000000;
@%p131 bra $L__BB0_194;
ld.shared.f32 %f345, [%rd31];
@@ -1388,35 +1386,35 @@
$L__BB0_196:
bar.sync 0;
@%p105 bra $L__BB0_200;
$L__BB0_197:
- setp.ge.u32 %p134, %r5, %r461;
+ setp.ge.u32 %p134, %r6, %r459;
@%p134 bra $L__BB0_199;
- add.s32 %r410, %r461, %r39;
- mul.wide.s32 %rd152, %r410, 4;
+ add.s32 %r409, %r459, %r39;
+ mul.wide.s32 %rd152, %r409, 4;
add.s64 %rd154, %rd45, %rd152;
ld.shared.f32 %f349, [%rd23];
ld.shared.f32 %f350, [%rd154];
add.f32 %f351, %f350, %f349;
st.shared.f32 [%rd23], %f351;
$L__BB0_199:
bar.sync 0;
- shr.u32 %r120, %r461, 1;
- setp.gt.u32 %p135, %r461, 3;
- mov.u32 %r461, %r120;
+ shr.u32 %r120, %r459, 1;
+ setp.gt.u32 %p135, %r459, 3;
+ mov.u32 %r459, %r120;
@%p135 bra $L__BB0_197;
$L__BB0_200:
add.f32 %f353, %f406, 0f3F800000;
add.f32 %f96, %f353, %f400;
mov.f32 %f407, 0f00000000;
@%p108 bra $L__BB0_203;
- setp.lt.u32 %p137, %r2, 2;
+ setp.lt.u32 %p137, %r3, 2;
ld.shared.f32 %f354, [%rd23];
add.f32 %f407, %f354, 0f00000000;
@%p137 bra $L__BB0_203;
ld.shared.f32 %f355, [%rd31];
@@ -1425,35 +1423,35 @@
$L__BB0_203:
bar.sync 0;
or.pred %p140, %p108, %p119;
@%p140 bra $L__BB0_206;
- shl.b32 %r121, %r7, 1;
- mul.lo.s32 %r411, %r3, %r74;
- shl.b32 %r122, %r411, 1;
- add.s32 %r412, %r121, %r122;
- or.b32 %r413, %r412, 1;
- setp.ge.s32 %p141, %r413, %r176;
+ shl.b32 %r121, %r8, 1;
+ mul.lo.s32 %r410, %r4, %r74;
+ shl.b32 %r122, %r410, 1;
+ add.s32 %r411, %r121, %r122;
+ or.b32 %r412, %r411, 1;
+ setp.ge.s32 %p141, %r412, %r176;
@%p141 bra $L__BB0_206;
ld.param.u64 %rd162, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_9];
ld.param.u64 %rd158, [_ZN11kernelscope6kernelENS_6TensorIfLi2ELi2EEES1_S1_S1_NS0_IfLi1ELi1EEES2_S2_S2_S1_S2_S1_S1_NS0_IxLi1ELi1EEE_param_6];
- add.s32 %r418, %r122, %r121;
- mul.wide.s32 %rd157, %r418, 4;
+ add.s32 %r417, %r122, %r121;
+ mul.wide.s32 %rd157, %r417, 4;
add.s64 %rd155, %rd158, %rd157;
- mov.b32 %r415, %f407;
- mov.b32 %r414, %f406;
-
- st.global.cs.v2.s32 [%rd155], {%r414,%r415};
+ mov.b32 %r414, %f407;
+ mov.b32 %r413, %f406;
+
+ st.global.cs.v2.s32 [%rd155], {%r413,%r414};
add.s64 %rd156, %rd162, %rd157;
add.f32 %f356, %f407, 0f3F800000;
add.f32 %f357, %f356, %f401;
- mov.b32 %r417, %f357;
- mov.b32 %r416, %f96;
-
- st.global.cs.v2.s32 [%rd156], {%r416,%r417};
+ mov.b32 %r416, %f357;
+ mov.b32 %r415, %f96;
+
+ st.global.cs.v2.s32 [%rd156], {%r415,%r416};
$L__BB0_206:
ret;
26: CombinedSchedulerTest.InnerOuterMismatch
Kernel 1
CUDA
PTX
0ddccc60e
Diff
cfa1a2c6b
-7
+7 index type: int
registers: 32
gmem: 3
static smem: 0
stack frame: 0
spill stores: 0
spill loads: 0
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T3, Tensor<float, 1, 1> T4, Tensor<float, 2, 2> T13, Tensor<int64_t, 1, 1> T17) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[2LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T5 = reinterpret_cast<float*>(array + smem_offset + 0);
// Allocate global tensor T13
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T14[i0] = 0.000000000e+00f;
}
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i2 = 0; i2 < (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i2) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T5) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * T0.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
+ T10[i1];
}
} else {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
+ T10[i1];
}
}
Array<float, 1, 1> T1;
T1[0] = 0.000000000e+00f;
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
+ T8[i3];
}
} else {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
+ T8[i3];
}
}
blockReduce<true, false, false, true>(T1[0], T11[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T2;
broadcast::blockBroadcast<true, false, false, true>(T2[0], T1[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T9;
T9.set(float(0));
loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
= T2[0]
+ T9[i4];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T3[((((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))], &T6[0]);
} else {
Array<float, 4, 4> T9;
T9.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
= T2[0]
+ T9[i4];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T3[((((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))], &T6[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
blockReduce<false, true, false, true>(T14[i0], T12[i0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL]))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T13[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)blockIdx.y)))], &T14[0]);
}
}
// Allocate global tensor T17
grid_sync::sync<false, true, false, true, true>(T17[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T16;
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 2; ++i5) {
T16[i5] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i6 = 0; i6 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i6) {
Array<float, 2, 2> T15;
T15.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < T0.logical_size[2LL])) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i6)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T15[0], &*(volatile float*)&T13[((((T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * T0.logical_size[2LL]) * i6))]);
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 2; ++i5) {
T16[i5]
= T16[i5]
+ T15[i5];
}
}
Array<float, 2, 2> T7;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T7[i7] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<true, false, false, true>(T7[i7], T16[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < T0.logical_size[2LL]))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T7[0]);
}
}
__global__ void nvfuser_N(Tensor<float, 3, 3> T0, Tensor<float, 3, 3> T3, Tensor<float, 1, 1> T4, Tensor<float, 2, 2> T13, Tensor<int64_t, 1, 1> T17) {
alignas(16) extern __shared__ char array[];
void* shared_mem = array;
const unsigned smem_offset = alignBufferSize((max((ceilDiv(T0.logical_size[2LL], 4)), ((nvfuser_index_t)blockDim.x))) * ((nvfuser_index_t)blockDim.y) * 1 * sizeof(float), 16);
float* T5 = reinterpret_cast<float*>(array + smem_offset + 0);
// Allocate global tensor T13
Array<float, 4, 4> T14;
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
T14[i0] = 0.000000000e+00f;
}
Array<float, 4, 1> T12;
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i2 = 0; i2 < (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y))); ++i2) {
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
asm volatile(
"{\n"
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
:"r"((uint32_t)(((toSmem(T5) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
+ T10[i1];
}
} else {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
+ T10[i1];
}
}
Array<float, 1, 1> T1;
T1[0] = 0.000000000e+00f;
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
+ T8[i3];
}
} else {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
+ T8[i3];
}
}
blockReduce<true, false, false, true>(T1[0], T11[0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
Array<float, 1, 1> T2;
broadcast::blockBroadcast<true, false, false, true>(T2[0], T1[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T9;
T9.set(float(0));
loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
= T2[0]
+ T9[i4];
}
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T3[((((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))], &T6[0]);
} else {
Array<float, 4, 4> T9;
T9.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
= T2[0]
+ T9[i4];
}
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T3[((((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))], &T6[0]);
}
}
}
#pragma unroll
for(nvfuser_index_t i0 = 0; i0 < 4; ++i0) {
blockReduce<false, true, false, true>(T14[i0], T12[i0], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if (((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL]))) {
if ((((nvfuser_index_t)threadIdx.y) == 0)) {
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/true>( &*(volatile float*)&T13[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)blockIdx.y)))], &T14[0]);
}
}
// Allocate global tensor T17
grid_sync::sync<false, true, false, true, true>(T17[index_utils::maskedOffset<true, false, true>(blockIdx, gridDim)], index_utils::maskedSize<false, true, false>(gridDim), DefaultBlockDim());
Array<float, 2, 1> T16;
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 2; ++i5) {
T16[i5] = 0.000000000e+00f;
}
#pragma unroll 1
for(nvfuser_index_t i6 = 0; i6 < (ceilDiv(((nvfuser_index_t)gridDim.y), ((nvfuser_index_t)blockDim.x))); ++i6) {
Array<float, 2, 2> T15;
T15.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), ((nvfuser_index_t)blockDim.y)))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < T0.logical_size[2LL])) && ((((nvfuser_index_t)threadIdx.x) + (((nvfuser_index_t)blockDim.x) * i6)) < ((nvfuser_index_t)gridDim.y)))) {
loadGlobalToLocal<float, /*vec_size=*/2, /*is_volatile=*/true, CacheOp::Streaming>(&T15[0], &*(volatile float*)&T13[((((T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.x)) + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.x) * T0.logical_size[2LL]) * i6))]);
}
#pragma unroll
for(nvfuser_index_t i5 = 0; i5 < 2; ++i5) {
T16[i5]
= T16[i5]
+ T15[i5];
}
}
Array<float, 2, 2> T7;
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
T7[i7] = 0.000000000e+00f;
}
#pragma unroll
for(nvfuser_index_t i7 = 0; i7 < 2; ++i7) {
blockReduce<true, false, false, true>(T7[i7], T16[i7], [](float &a, float b) { a = a + b; }, static_cast<float*>(shared_mem), true, true, float(0.000000000e+00f), DefaultBlockDim());
}
if ((((((nvfuser_index_t)threadIdx.x) == 0) && (((nvfuser_index_t)blockIdx.y) < (ceilDiv((ceilDiv(T0.logical_size[2LL], 2)), ((nvfuser_index_t)blockDim.y))))) && (((1 + (2 * ((nvfuser_index_t)threadIdx.y))) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y))) < T0.logical_size[2LL]))) {
loadLocalToGlobal<float, /*vec_size=*/2, /*is_volatile=*/false>( &T4[((2 * ((nvfuser_index_t)threadIdx.y)) + ((2 * ((nvfuser_index_t)blockDim.y)) * ((nvfuser_index_t)blockIdx.y)))], &T7[0]);
}
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -23,32 +23,32 @@
" .reg .pred p0; \n"
" setp.ne.b32 p0, %3, 0;\n"
" cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
"}\n"
:
- :"r"((uint32_t)(((toSmem(T5) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((4 * T0.logical_size[2LL]) * ((nvfuser_index_t)threadIdx.y))))),
+ :"r"((uint32_t)(((toSmem(T5) + (16 * ((nvfuser_index_t)threadIdx.x))) + ((16 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y))))),
"l"(((((T0.data + (4 * ((nvfuser_index_t)threadIdx.x))) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))),
"n"(16LL),
"r"((uint32_t)(false))
);
}
asm volatile("cp.async.wait_all;\n");
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
+ T10[i1];
}
} else {
Array<float, 4, 4> T10;
T10.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
- loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T10[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i1 = 0; i1 < 4; ++i1) {
T12[i1]
= T12[i1]
@@ -60,22 +60,22 @@
Array<float, 1, 1> T11;
T11[0] = 0.000000000e+00f;
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
- loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
+ T8[i3];
}
} else {
Array<float, 4, 4> T8;
T8.set(float(0.000000000e+00f));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
- loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T8[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
#pragma unroll
for(nvfuser_index_t i3 = 0; i3 < 4; ++i3) {
T11[0]
= T11[0]
@@ -86,11 +86,11 @@
Array<float, 1, 1> T2;
broadcast::blockBroadcast<true, false, false, true>(T2[0], T1[0], static_cast<float*>(shared_mem), true, DefaultBlockDim());
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
Array<float, 4, 4> T9;
T9.set(float(0));
- loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
= T2[0]
@@ -99,11 +99,11 @@
loadLocalToGlobal<float, /*vec_size=*/4, /*is_volatile=*/false>( &T3[((((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y))) + (((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + ((((nvfuser_index_t)blockDim.y) * T0.logical_size[2LL]) * i2))], &T6[0]);
} else {
Array<float, 4, 4> T9;
T9.set(float(0));
if ((((((nvfuser_index_t)threadIdx.x) < (ceilDiv(T0.logical_size[2LL], 4))) && ((3 + (4 * ((nvfuser_index_t)threadIdx.x))) < T0.logical_size[2LL])) && (((((nvfuser_index_t)threadIdx.y) + ((((nvfuser_index_t)blockDim.y) * (ceilDiv((ceilDiv((T0.logical_size[0LL] * T0.logical_size[1LL]), ((nvfuser_index_t)blockDim.y))), ((nvfuser_index_t)gridDim.y)))) * ((nvfuser_index_t)blockIdx.y))) + (((nvfuser_index_t)blockDim.y) * i2)) < (T0.logical_size[0LL] * T0.logical_size[1LL])))) {
- loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + (T0.logical_size[2LL] * ((nvfuser_index_t)threadIdx.y)))]);
+ loadGeneric<float, 4>( &T9[0], &T5[((4 * ((nvfuser_index_t)threadIdx.x)) + ((4 * (ceilDiv(T0.logical_size[2LL], 4))) * ((nvfuser_index_t)threadIdx.y)))]);
}
Array<float, 4, 4> T6;
#pragma unroll
for(nvfuser_index_t i4 = 0; i4 < 4; ++i4) {
T6[i4]
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103393std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103393std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103393std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103395arrayE[];
.entry _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_1[32],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_2[16],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_4[16]
)
{
.reg .pred %p<84>;
.reg .f32 %f<177>;
.reg .b32 %r<362>;
.reg .b64 %rd<92>;
ld.param.v2.u32 {%r126, %r127}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r128, %r129}, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0+16];
ld.param.u64 %rd24, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd23, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd22, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd21, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd20, [_ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_1033910nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0];
add.s32 %r146, %r128, 3;
shr.s32 %r147, %r146, 31;
shr.u32 %r148, %r147, 30;
add.s32 %r149, %r146, %r148;
shr.s32 %r2, %r149, 2;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r5, %r126, %r127;
add.s32 %r6, %r4, -1;
add.s32 %r150, %r6, %r5;
div.s32 %r151, %r150, %r4;
mov.u32 %r7, %nctaid.y;
add.s32 %r8, %r7, -1;
add.s32 %r152, %r8, %r151;
div.s32 %r9, %r152, %r7;
setp.gt.s32 %p6, %r9, 0;
@%p6 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
shl.b32 %r154, %r4, 2;
max.s32 %r155, %r2, %r3;
mad.lo.s32 %r156, %r154, %r155, 15;
and.b32 %r157, %r156, -16;
cvt.u64.u32 %rd25, %r157;
mov.u64 %rd26, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103395arrayE;
add.s64 %rd27, %rd26, %rd25;
mov.u32 %r10, %tid.x;
setp.ge.s32 %p7, %r10, %r2;
shl.b32 %r11, %r10, 2;
or.b32 %r158, %r11, 3;
setp.ge.s32 %p8, %r158, %r128;
shl.b32 %r159, %r10, 4;
mov.u32 %r12, %tid.y;
shl.b32 %r160, %r12, 2;
mad.lo.s32 %r13, %r160, %r128, %r159;
cvt.s64.s32 %rd28, %r11;
mul.lo.s32 %r161, %r128, %r12;
cvt.s64.s32 %rd29, %r161;
add.s64 %rd2, %rd29, %rd28;
mov.u32 %r162, %ctaid.y;
mul.lo.s32 %r163, %r162, %r4;
mul.lo.s32 %r164, %r163, %r128;
mul.lo.s32 %r14, %r128, %r4;
mov.u32 %r165, %tid.z;
mad.lo.s32 %r166, %r4, %r165, %r12;
mad.lo.s32 %r15, %r166, %r3, %r10;
mul.wide.u32 %rd30, %r15, 4;
add.s64 %rd3, %rd26, %rd30;
clz.b32 %r167, %r3;
mov.u32 %r168, 31;
sub.s32 %r169, %r168, %r167;
mov.u32 %r170, 1;
shl.b32 %r16, %r170, %r169;
setp.lt.u32 %p9, %r10, %r16;
add.s32 %r171, %r16, %r10;
setp.lt.u32 %p10, %r171, %r3;
and.pred %p1, %p9, %p10;
add.s32 %r172, %r15, %r16;
mul.wide.s32 %rd31, %r172, 4;
add.s64 %rd4, %rd26, %rd31;
shr.u32 %r173, %r16, 31;
add.s32 %r174, %r16, %r173;
shr.s32 %r17, %r174, 1;
add.s32 %r175, %r161, %r11;
mul.wide.s32 %rd32, %r175, 4;
add.s64 %rd5, %rd27, %rd32;
add.s32 %r176, %r15, 1;
mul.wide.u32 %rd33, %r176, 4;
add.s64 %rd6, %rd26, %rd33;
mul.wide.s32 %rd34, %r166, 4;
add.s64 %rd7, %rd26, %rd34;
or.pred %p2, %p7, %p8;
mul.lo.s32 %r18, %r9, %r162;
mul.lo.s32 %r177, %r164, %r9;
cvt.s64.s32 %rd8, %r177;
cvta.shared.u64 %rd35, %rd26;
add.s64 %rd10, %rd35, %rd25;
mov.u32 %r330, 0;
mov.f32 %f163, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd10; cvt.u32.u64 %r180, smem_ptr; }
// end inline asm
add.s32 %r181, %r13, %r180;
not.pred %p16, %p1;
mov.f32 %f164, %f163;
mov.f32 %f165, %f163;
mov.f32 %f166, %f163;
$L__BB0_3:
.pragma "nounroll";
@%p2 bra $L__BB0_6;
add.s32 %r178, %r18, %r330;
mad.lo.s32 %r179, %r178, %r4, %r12;
setp.ge.s32 %p11, %r179, %r5;
@%p11 bra $L__BB0_6;
mul.lo.s32 %r183, %r14, %r330;
cvt.s64.s32 %rd38, %r183;
add.s64 %rd39, %rd2, %rd38;
add.s64 %rd40, %rd39, %rd8;
shl.b64 %rd41, %rd40, 2;
add.s64 %rd37, %rd20, %rd41;
mov.u32 %r182, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r182, 0;
cp.async.ca.shared.global [%r181], [%rd37], 16, p0;
}
// end inline asm
$L__BB0_6:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p2 bra $L__BB0_9;
add.s32 %r184, %r18, %r330;
mad.lo.s32 %r185, %r184, %r4, %r12;
setp.ge.s32 %p12, %r185, %r5;
@%p12 bra $L__BB0_9;
ld.shared.v4.u32 {%r331, %r332, %r333, %r334}, [%rd5];
bra.uni $L__BB0_12;
$L__BB0_9:
mov.u32 %r331, 0;
mov.u32 %r332, %r331;
mov.u32 %r333, %r331;
mov.u32 %r334, %r331;
@%p2 bra $L__BB0_12;
add.s32 %r198, %r18, %r330;
mad.lo.s32 %r199, %r198, %r4, %r12;
setp.ge.s32 %p13, %r199, %r5;
@%p13 bra $L__BB0_12;
ld.shared.v4.u32 {%r331, %r332, %r333, %r334}, [%rd5];
$L__BB0_12:
mov.b32 %f53, %r334;
add.f32 %f166, %f166, %f53;
mov.b32 %f54, %r333;
add.f32 %f165, %f165, %f54;
mov.b32 %f55, %r332;
add.f32 %f164, %f164, %f55;
mov.b32 %f56, %r331;
add.f32 %f163, %f163, %f56;
@%p2 bra $L__BB0_15;
add.s32 %r204, %r18, %r330;
mad.lo.s32 %r205, %r204, %r4, %r12;
setp.ge.s32 %p14, %r205, %r5;
@%p14 bra $L__BB0_15;
ld.shared.v4.f32 {%f57, %f58, %f59, %f60}, [%rd5];
add.f32 %f62, %f57, 0f00000000;
add.f32 %f64, %f62, %f58;
add.f32 %f66, %f64, %f59;
add.f32 %f161, %f66, %f60;
bra.uni $L__BB0_19;
$L__BB0_15:
mov.u32 %r335, 0;
mov.u32 %r336, %r335;
mov.u32 %r337, %r335;
mov.u32 %r338, %r335;
@%p2 bra $L__BB0_18;
add.s32 %r214, %r18, %r330;
mad.lo.s32 %r215, %r214, %r4, %r12;
setp.ge.s32 %p15, %r215, %r5;
@%p15 bra $L__BB0_18;
ld.shared.v4.u32 {%r338, %r337, %r336, %r335}, [%rd5];
$L__BB0_18:
mov.b32 %f68, %r338;
add.f32 %f69, %f68, 0f00000000;
mov.b32 %f70, %r337;
add.f32 %f71, %f69, %f70;
mov.b32 %f72, %r336;
add.f32 %f73, %f71, %f72;
mov.b32 %f74, %r335;
add.f32 %f161, %f73, %f74;
$L__BB0_19:
st.shared.f32 [%rd3], %f161;
bar.sync 0;
@%p16 bra $L__BB0_21;
ld.shared.f32 %f75, [%rd4];
ld.shared.f32 %f76, [%rd3];
add.f32 %f77, %f75, %f76;
st.shared.f32 [%rd3], %f77;
$L__BB0_21:
setp.lt.s32 %p17, %r16, 4;
bar.sync 0;
@%p17 bra $L__BB0_26;
mov.u32 %r339, %r17;
$L__BB0_23:
setp.ge.u32 %p18, %r10, %r339;
@%p18 bra $L__BB0_25;
add.s32 %r220, %r339, %r15;
mul.wide.s32 %rd42, %r220, 4;
add.s64 %rd44, %rd26, %rd42;
ld.shared.f32 %f78, [%rd3];
ld.shared.f32 %f79, [%rd44];
add.f32 %f80, %f79, %f78;
st.shared.f32 [%rd3], %f80;
$L__BB0_25:
bar.sync 0;
shr.u32 %r41, %r339, 1;
setp.gt.u32 %p19, %r339, 3;
mov.u32 %r339, %r41;
@%p19 bra $L__BB0_23;
$L__BB0_26:
setp.ne.s32 %p20, %r10, 0;
mov.f32 %f162, 0f00000000;
@%p20 bra $L__BB0_29;
setp.lt.u32 %p21, %r3, 2;
ld.shared.f32 %f82, [%rd3];
add.f32 %f162, %f82, 0f00000000;
@%p21 bra $L__BB0_29;
ld.shared.f32 %f83, [%rd6];
add.f32 %f162, %f162, %f83;
$L__BB0_29:
bar.sync 0;
@%p20 bra $L__BB0_31;
st.shared.f32 [%rd7], %f162;
$L__BB0_31:
bar.sync 0;
ld.shared.f32 %f15, [%rd7];
bar.sync 0;
@%p2 bra $L__BB0_34;
add.s32 %r221, %r18, %r330;
mad.lo.s32 %r42, %r221, %r4, %r12;
setp.ge.s32 %p23, %r42, %r5;
@%p23 bra $L__BB0_34;
ld.shared.v4.f32 {%f84, %f85, %f86, %f87}, [%rd5];
add.f32 %f89, %f15, %f84;
mov.b32 %r222, %f89;
add.f32 %f91, %f15, %f85;
mov.b32 %r223, %f91;
add.f32 %f93, %f15, %f86;
mov.b32 %r224, %f93;
add.f32 %f95, %f15, %f87;
mov.b32 %r225, %f95;
mad.lo.s32 %r226, %r42, %r128, %r11;
mul.wide.s32 %rd46, %r226, 4;
add.s64 %rd45, %rd21, %rd46;
// begin inline asm
st.global.cs.v4.s32 [%rd45], {%r222,%r223,%r224,%r225};
// end inline asm
bra.uni $L__BB0_40;
$L__BB0_34:
mov.u32 %r340, 0;
mov.u32 %r341, %r340;
mov.u32 %r342, %r340;
mov.u32 %r343, %r340;
@%p2 bra $L__BB0_37;
add.s32 %r235, %r18, %r330;
mad.lo.s32 %r236, %r235, %r4, %r12;
setp.ge.s32 %p24, %r236, %r5;
@%p24 bra $L__BB0_37;
ld.shared.v4.u32 {%r343, %r342, %r341, %r340}, [%rd5];
$L__BB0_37:
mov.b32 %f96, %r343;
add.f32 %f97, %f15, %f96;
mov.b32 %r51, %f97;
mov.b32 %f98, %r342;
add.f32 %f99, %f15, %f98;
mov.b32 %r52, %f99;
mov.b32 %f100, %r341;
add.f32 %f101, %f15, %f100;
mov.b32 %r53, %f101;
mov.b32 %f102, %r340;
add.f32 %f16, %f15, %f102;
@%p2 bra $L__BB0_40;
add.s32 %r241, %r18, %r330;
mad.lo.s32 %r54, %r241, %r4, %r12;
setp.ge.s32 %p25, %r54, %r5;
@%p25 bra $L__BB0_40;
mad.lo.s32 %r246, %r54, %r128, %r11;
mul.wide.s32 %rd48, %r246, 4;
add.s64 %rd47, %rd21, %rd48;
mov.b32 %r245, %f16;
// begin inline asm
st.global.cs.v4.s32 [%rd47], {%r51,%r52,%r53,%r245};
// end inline asm
$L__BB0_40:
add.s32 %r330, %r330, 1;
setp.lt.s32 %p26, %r330, %r9;
@%p26 bra $L__BB0_3;
bra.uni $L__BB0_41;
$L__BB0_1:
mov.f32 %f163, 0f00000000;
mov.f32 %f164, %f163;
mov.f32 %f165, %f163;
mov.f32 %f166, %f163;
$L__BB0_41:
mov.u32 %r56, %tid.x;
mov.u32 %r247, %tid.z;
mov.u32 %r59, %tid.y;
mad.lo.s32 %r248, %r4, %r247, %r59;
mad.lo.s32 %r57, %r248, %r3, %r56;
mul.wide.u32 %rd49, %r57, 4;
mov.u64 %rd50, _ZN58_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_cf748c75_103395arrayE;
add.s64 %rd12, %rd50, %rd49;
clz.b32 %r249, %r4;
mov.u32 %r250, 31;
sub.s32 %r251, %r250, %r249;
mov.u32 %r252, 1;
shl.b32 %r58, %r252, %r251;
setp.lt.u32 %p27, %r59, %r58;
add.s32 %r253, %r58, %r59;
setp.lt.u32 %p28, %r253, %r4;
and.pred %p3, %p27, %p28;
shl.b32 %r254, %r3, %r251;
add.s32 %r255, %r57, %r254;
mul.wide.s32 %rd51, %r255, 4;
add.s64 %rd13, %rd50, %rd51;
shr.u32 %r256, %r58, 31;
add.s32 %r257, %r58, %r256;
shr.s32 %r350, %r257, 1;
st.shared.f32 [%rd12], %f163;
bar.sync 0;
not.pred %p29, %p3;
@%p29 bra $L__BB0_43;
ld.shared.f32 %f103, [%rd13];
ld.shared.f32 %f104, [%rd12];
add.f32 %f105, %f103, %f104;
st.shared.f32 [%rd12], %f105;
$L__BB0_43:
setp.lt.s32 %p30, %r58, 4;
bar.sync 0;
@%p30 bra $L__BB0_48;
mov.u32 %r344, %r350;
$L__BB0_45:
setp.ge.u32 %p31, %r59, %r344;
@%p31 bra $L__BB0_47;
mad.lo.s32 %r258, %r344, %r3, %r57;
mul.wide.s32 %rd52, %r258, 4;
add.s64 %rd54, %rd50, %rd52;
ld.shared.f32 %f106, [%rd12];
ld.shared.f32 %f107, [%rd54];
add.f32 %f108, %f107, %f106;
st.shared.f32 [%rd12], %f108;
$L__BB0_47:
bar.sync 0;
shr.u32 %r62, %r344, 1;
setp.gt.u32 %p32, %r344, 3;
mov.u32 %r344, %r62;
@%p32 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r260, %r57, %r3;
mul.wide.u32 %rd55, %r260, 4;
add.s64 %rd14, %rd50, %rd55;
setp.ne.s32 %p33, %r59, 0;
mov.u32 %r345, 0;
@%p33 bra $L__BB0_52;
setp.lt.u32 %p34, %r4, 2;
ld.shared.f32 %f109, [%rd12];
add.f32 %f167, %f109, 0f00000000;
@%p34 bra $L__BB0_51;
ld.shared.f32 %f110, [%rd14];
add.f32 %f167, %f167, %f110;
$L__BB0_51:
mov.b32 %r345, %f167;
$L__BB0_52:
bar.sync 0;
st.shared.f32 [%rd12], %f164;
bar.sync 0;
@%p29 bra $L__BB0_54;
ld.shared.f32 %f111, [%rd13];
ld.shared.f32 %f112, [%rd12];
add.f32 %f113, %f111, %f112;
st.shared.f32 [%rd12], %f113;
$L__BB0_54:
bar.sync 0;
@%p30 bra $L__BB0_59;
mov.u32 %r346, %r350;
$L__BB0_56:
setp.ge.u32 %p37, %r59, %r346;
@%p37 bra $L__BB0_58;
mad.lo.s32 %r261, %r346, %r3, %r57;
mul.wide.s32 %rd57, %r261, 4;
add.s64 %rd59, %rd50, %rd57;
ld.shared.f32 %f114, [%rd12];
ld.shared.f32 %f115, [%rd59];
add.f32 %f116, %f115, %f114;
st.shared.f32 [%rd12], %f116;
$L__BB0_58:
bar.sync 0;
shr.u32 %r66, %r346, 1;
setp.gt.u32 %p38, %r346, 3;
mov.u32 %r346, %r66;
@%p38 bra $L__BB0_56;
$L__BB0_59:
mov.u32 %r347, 0;
@%p33 bra $L__BB0_63;
setp.lt.u32 %p40, %r4, 2;
ld.shared.f32 %f117, [%rd12];
add.f32 %f168, %f117, 0f00000000;
@%p40 bra $L__BB0_62;
ld.shared.f32 %f118, [%rd14];
add.f32 %f168, %f168, %f118;
$L__BB0_62:
mov.b32 %r347, %f168;
$L__BB0_63:
bar.sync 0;
st.shared.f32 [%rd12], %f165;
bar.sync 0;
@%p29 bra $L__BB0_65;
ld.shared.f32 %f119, [%rd13];
ld.shared.f32 %f120, [%rd12];
add.f32 %f121, %f119, %f120;
st.shared.f32 [%rd12], %f121;
$L__BB0_65:
bar.sync 0;
@%p30 bra $L__BB0_70;
mov.u32 %r348, %r350;
$L__BB0_67:
setp.ge.u32 %p43, %r59, %r348;
@%p43 bra $L__BB0_69;
mad.lo.s32 %r263, %r348, %r3, %r57;
mul.wide.s32 %rd60, %r263, 4;
add.s64 %rd62, %rd50, %rd60;
ld.shared.f32 %f122, [%rd12];
ld.shared.f32 %f123, [%rd62];
add.f32 %f124, %f123, %f122;
st.shared.f32 [%rd12], %f124;
$L__BB0_69:
bar.sync 0;
shr.u32 %r70, %r348, 1;
setp.gt.u32 %p44, %r348, 3;
mov.u32 %r348, %r70;
@%p44 bra $L__BB0_67;
$L__BB0_70:
mov.u32 %r349, 0;
@%p33 bra $L__BB0_74;
setp.lt.u32 %p46, %r4, 2;
ld.shared.f32 %f125, [%rd12];
add.f32 %f169, %f125, 0f00000000;
@%p46 bra $L__BB0_73;
ld.shared.f32 %f126, [%rd14];
add.f32 %f169, %f169, %f126;
$L__BB0_73:
mov.b32 %r349, %f169;
$L__BB0_74:
bar.sync 0;
st.shared.f32 [%rd12], %f166;
bar.sync 0;
@%p29 bra $L__BB0_76;
ld.shared.f32 %f127, [%rd13];
ld.shared.f32 %f128, [%rd12];
add.f32 %f129, %f127, %f128;
st.shared.f32 [%rd12], %f129;
$L__BB0_76:
bar.sync 0;
@%p30 bra $L__BB0_80;
$L__BB0_77:
setp.ge.u32 %p49, %r59, %r350;
@%p49 bra $L__BB0_79;
mad.lo.s32 %r265, %r350, %r3, %r57;
mul.wide.s32 %rd63, %r265, 4;
add.s64 %rd65, %rd50, %rd63;
ld.shared.f32 %f130, [%rd12];
ld.shared.f32 %f131, [%rd65];
add.f32 %f132, %f131, %f130;
st.shared.f32 [%rd12], %f132;
$L__BB0_79:
bar.sync 0;
shr.u32 %r74, %r350, 1;
setp.gt.u32 %p50, %r350, 3;
mov.u32 %r350, %r74;
@%p50 bra $L__BB0_77;
$L__BB0_80:
mov.u32 %r351, 0;
@%p33 bra $L__BB0_84;
setp.lt.u32 %p52, %r4, 2;
ld.shared.f32 %f133, [%rd12];
add.f32 %f170, %f133, 0f00000000;
@%p52 bra $L__BB0_83;
ld.shared.f32 %f134, [%rd14];
add.f32 %f170, %f170, %f134;
$L__BB0_83:
mov.b32 %r351, %f170;
$L__BB0_84:
bar.sync 0;
setp.ge.s32 %p53, %r56, %r2;
@%p53 bra $L__BB0_87;
shl.b32 %r77, %r56, 2;
or.b32 %r267, %r77, 3;
setp.ge.s32 %p55, %r267, %r128;
or.pred %p56, %p33, %p55;
@%p56 bra $L__BB0_87;
mov.u32 %r272, %ctaid.y;
mad.lo.s32 %r273, %r128, %r272, %r77;
mul.wide.s32 %rd67, %r273, 4;
add.s64 %rd66, %rd23, %rd67;
// begin inline asm
st.volatile.global.v4.s32 [%rd66], {%r345,%r347,%r349,%r351};
// end inline asm
$L__BB0_87:
mov.u32 %r78, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r275, %r247, %r56;
or.b32 %r276, %r275, %r59;
setp.ne.s32 %p57, %r276, 0;
@%p57 bra $L__BB0_91;
cvta.to.global.u64 %rd68, %rd24;
mov.u32 %r277, %ctaid.z;
mov.u32 %r278, %nctaid.x;
mov.u32 %r279, %ctaid.x;
mad.lo.s32 %r280, %r277, %r278, %r279;
mul.wide.s32 %rd69, %r280, 8;
add.s64 %rd16, %rd68, %rd69;
setp.eq.s32 %p58, %r78, %r8;
cvt.s64.s32 %rd70, %r7;
mov.u64 %rd71, -9223372036854775807;
sub.s64 %rd72, %rd71, %rd70;
selp.b64 %rd73, %rd72, 1, %p58;
atom.global.add.u64 %rd17, [%rd16], %rd73;
ld.volatile.global.u64 %rd74, [%rd16];
xor.b64 %rd75, %rd74, %rd17;
setp.lt.s64 %p59, %rd75, 0;
@%p59 bra $L__BB0_91;
mov.u32 %r352, 8;
$L__BB0_90:
// begin inline asm
nanosleep.u32 %r352;
// end inline asm
setp.lt.u32 %p60, %r352, 256;
selp.u32 %r283, 1, 0, %p60;
shl.b32 %r352, %r352, %r283;
ld.volatile.global.u64 %rd76, [%rd16];
xor.b64 %rd77, %rd76, %rd17;
setp.gt.s64 %p61, %rd77, -1;
@%p61 bra $L__BB0_90;
$L__BB0_91:
bar.sync 0;
add.s32 %r284, %r8, %r3;
div.s32 %r81, %r284, %r3;
setp.lt.s32 %p62, %r81, 1;
mov.f32 %f173, 0f00000000;
mov.f32 %f174, %f173;
@%p62 bra $L__BB0_97;
add.s32 %r286, %r128, 1;
shr.u32 %r287, %r286, 31;
add.s32 %r288, %r286, %r287;
shr.s32 %r289, %r288, 1;
add.s32 %r290, %r6, %r289;
shl.b32 %r291, %r4, 1;
shl.b32 %r292, %r59, 1;
mad.lo.s32 %r293, %r291, %r78, %r292;
or.b32 %r294, %r293, 1;
setp.ge.s32 %p63, %r294, %r128;
div.s32 %r295, %r290, %r4;
setp.ge.s32 %p64, %r78, %r295;
or.pred %p4, %p64, %p63;
mul.lo.s32 %r296, %r4, %r78;
shl.b32 %r297, %r296, 1;
mad.lo.s32 %r298, %r128, %r56, %r297;
add.s32 %r354, %r298, %r292;
mul.lo.s32 %r83, %r128, %r3;
mov.u32 %r285, 0;
mov.f32 %f173, 0f00000000;
mov.u32 %r353, %r56;
mov.u32 %r355, %r285;
$L__BB0_93:
.pragma "nounroll";
mov.u32 %r356, %r285;
mov.u32 %r357, %r285;
@%p4 bra $L__BB0_96;
setp.ge.s32 %p65, %r353, %r7;
mov.u32 %r356, %r285;
mov.u32 %r357, %r285;
@%p65 bra $L__BB0_96;
mul.wide.s32 %rd79, %r354, 4;
add.s64 %rd78, %rd23, %rd79;
// begin inline asm
ld.volatile.global.v2.s32 {%r357,%r356}, [%rd78];
// end inline asm
$L__BB0_96:
mov.b32 %f139, %r357;
add.f32 %f173, %f173, %f139;
mov.b32 %f140, %r356;
add.f32 %f174, %f174, %f140;
add.s32 %r354, %r354, %r83;
add.s32 %r353, %r353, %r3;
add.s32 %r355, %r355, 1;
setp.lt.s32 %p66, %r355, %r81;
@%p66 bra $L__BB0_93;
$L__BB0_97:
clz.b32 %r305, %r3;
mov.u32 %r306, 31;
sub.s32 %r307, %r306, %r305;
mov.u32 %r308, 1;
shl.b32 %r94, %r308, %r307;
setp.lt.u32 %p67, %r56, %r94;
add.s32 %r309, %r94, %r56;
setp.lt.u32 %p68, %r309, %r3;
and.pred %p5, %p67, %p68;
add.s32 %r310, %r57, %r94;
mul.wide.s32 %rd80, %r310, 4;
add.s64 %rd18, %rd50, %rd80;
shr.u32 %r311, %r94, 31;
add.s32 %r312, %r94, %r311;
shr.s32 %r360, %r312, 1;
st.shared.f32 [%rd12], %f173;
bar.sync 0;
not.pred %p69, %p5;
@%p69 bra $L__BB0_99;
ld.shared.f32 %f141, [%rd18];
ld.shared.f32 %f142, [%rd12];
add.f32 %f143, %f141, %f142;
st.shared.f32 [%rd12], %f143;
$L__BB0_99:
setp.lt.s32 %p70, %r94, 4;
bar.sync 0;
@%p70 bra $L__BB0_104;
mov.u32 %r358, %r360;
$L__BB0_101:
setp.ge.u32 %p71, %r56, %r358;
@%p71 bra $L__BB0_103;
add.s32 %r313, %r358, %r57;
mul.wide.s32 %rd82, %r313, 4;
add.s64 %rd84, %rd50, %rd82;
ld.shared.f32 %f144, [%rd12];
ld.shared.f32 %f145, [%rd84];
add.f32 %f146, %f145, %f144;
st.shared.f32 [%rd12], %f146;
$L__BB0_103:
bar.sync 0;
shr.u32 %r97, %r358, 1;
setp.gt.u32 %p72, %r358, 3;
mov.u32 %r358, %r97;
@%p72 bra $L__BB0_101;
$L__BB0_104:
add.s32 %r315, %r57, 1;
mul.wide.u32 %rd85, %r315, 4;
add.s64 %rd19, %rd50, %rd85;
setp.ne.s32 %p73, %r56, 0;
mov.u32 %r359, 0;
@%p73 bra $L__BB0_108;
setp.lt.u32 %p74, %r3, 2;
ld.shared.f32 %f147, [%rd12];
add.f32 %f175, %f147, 0f00000000;
@%p74 bra $L__BB0_107;
ld.shared.f32 %f148, [%rd19];
add.f32 %f175, %f175, %f148;
$L__BB0_107:
mov.b32 %r359, %f175;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd12], %f174;
bar.sync 0;
@%p69 bra $L__BB0_110;
ld.shared.f32 %f149, [%rd18];
ld.shared.f32 %f150, [%rd12];
add.f32 %f151, %f149, %f150;
st.shared.f32 [%rd12], %f151;
$L__BB0_110:
bar.sync 0;
@%p70 bra $L__BB0_114;
$L__BB0_111:
setp.ge.u32 %p77, %r56, %r360;
@%p77 bra $L__BB0_113;
add.s32 %r316, %r360, %r57;
mul.wide.s32 %rd87, %r316, 4;
add.s64 %rd89, %rd50, %rd87;
ld.shared.f32 %f152, [%rd12];
ld.shared.f32 %f153, [%rd89];
add.f32 %f154, %f153, %f152;
st.shared.f32 [%rd12], %f154;
$L__BB0_113:
bar.sync 0;
shr.u32 %r101, %r360, 1;
setp.gt.u32 %p78, %r360, 3;
mov.u32 %r360, %r101;
@%p78 bra $L__BB0_111;
$L__BB0_114:
mov.u32 %r361, 0;
@%p73 bra $L__BB0_118;
setp.lt.u32 %p80, %r3, 2;
ld.shared.f32 %f155, [%rd12];
add.f32 %f176, %f155, 0f00000000;
@%p80 bra $L__BB0_117;
ld.shared.f32 %f156, [%rd19];
add.f32 %f176, %f176, %f156;
$L__BB0_117:
mov.b32 %r361, %f176;
$L__BB0_118:
bar.sync 0;
@%p73 bra $L__BB0_122;
add.s32 %r318, %r128, 1;
shr.u32 %r319, %r318, 31;
add.s32 %r320, %r318, %r319;
shr.s32 %r321, %r320, 1;
add.s32 %r322, %r6, %r321;
div.s32 %r323, %r322, %r4;
setp.ge.s32 %p82, %r78, %r323;
@%p82 bra $L__BB0_122;
shl.b32 %r104, %r59, 1;
mul.lo.s32 %r324, %r4, %r78;
shl.b32 %r105, %r324, 1;
add.s32 %r325, %r105, %r104;
or.b32 %r326, %r325, 1;
setp.ge.s32 %p83, %r326, %r128;
@%p83 bra $L__BB0_122;
add.s32 %r329, %r104, %r105;
mul.wide.s32 %rd91, %r329, 4;
add.s64 %rd90, %rd22, %rd91;
// begin inline asm
st.global.cs.v2.s32 [%rd90], {%r359,%r361};
// end inline asm
$L__BB0_122:
ret;
}
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-35583870
// Cuda compilation tools, release 12.8, V12.8.93
// Based on NVVM 7.0.1
//
.version 8.7
.target sm_90a
.address_size 64
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72333std17integral_constantIbLb0EE5valueE;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72333std17integral_constantIbLb1EE5valueE = 1;
.global .align 1 .u8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72333std14__numeric_typeIvE5valueE = 1;
.extern .shared .align 16 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72335arrayE[];
.entry _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE(
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0[32],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_1[32],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_2[16],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_3[24],
.param .align 8 .b8 _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_4[16]
)
{
.reg .pred %p<84>;
.reg .f32 %f<177>;
.reg .b32 %r<362>;
.reg .b64 %rd<92>;
ld.param.v2.u32 {%r126, %r127}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0+8];
ld.param.v2.u32 {%r128, %r129}, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0+16];
ld.param.u64 %rd24, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_4];
ld.param.u64 %rd23, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_3];
ld.param.u64 %rd22, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_2];
ld.param.u64 %rd21, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_1];
ld.param.u64 %rd20, [_ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_723310nvfuser_66ENS_6TensorIfLi3ELi3EEES1_NS0_IfLi1ELi1EEENS0_IfLi2ELi2EEENS0_IxLi1ELi1EEE_param_0];
add.s32 %r146, %r128, 3;
shr.s32 %r147, %r146, 31;
shr.u32 %r148, %r147, 30;
add.s32 %r149, %r146, %r148;
shr.s32 %r2, %r149, 2;
mov.u32 %r3, %ntid.x;
mov.u32 %r4, %ntid.y;
mul.lo.s32 %r5, %r126, %r127;
add.s32 %r6, %r4, -1;
add.s32 %r150, %r6, %r5;
div.s32 %r151, %r150, %r4;
mov.u32 %r7, %nctaid.y;
add.s32 %r8, %r7, -1;
add.s32 %r152, %r8, %r151;
div.s32 %r9, %r152, %r7;
setp.gt.s32 %p6, %r9, 0;
@%p6 bra $L__BB0_2;
bra.uni $L__BB0_1;
$L__BB0_2:
shl.b32 %r154, %r4, 2;
max.s32 %r155, %r2, %r3;
mad.lo.s32 %r156, %r154, %r155, 15;
and.b32 %r157, %r156, -16;
cvt.u64.u32 %rd25, %r157;
mov.u64 %rd26, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72335arrayE;
add.s64 %rd27, %rd26, %rd25;
mov.u32 %r10, %tid.x;
setp.ge.s32 %p7, %r10, %r2;
shl.b32 %r11, %r10, 2;
or.b32 %r158, %r11, 3;
setp.ge.s32 %p8, %r158, %r128;
mov.u32 %r12, %tid.y;
mad.lo.s32 %r159, %r2, %r12, %r10;
shl.b32 %r13, %r159, 4;
cvt.s64.s32 %rd28, %r11;
mul.lo.s32 %r160, %r128, %r12;
cvt.s64.s32 %rd29, %r160;
add.s64 %rd2, %rd29, %rd28;
mov.u32 %r161, %ctaid.y;
mul.lo.s32 %r162, %r161, %r4;
mul.lo.s32 %r163, %r162, %r128;
mul.lo.s32 %r14, %r128, %r4;
mov.u32 %r164, %tid.z;
mad.lo.s32 %r165, %r4, %r164, %r12;
mad.lo.s32 %r15, %r165, %r3, %r10;
mul.wide.u32 %rd30, %r15, 4;
add.s64 %rd3, %rd26, %rd30;
clz.b32 %r166, %r3;
mov.u32 %r167, 31;
sub.s32 %r168, %r167, %r166;
mov.u32 %r169, 1;
shl.b32 %r16, %r169, %r168;
setp.lt.u32 %p9, %r10, %r16;
add.s32 %r170, %r16, %r10;
setp.lt.u32 %p10, %r170, %r3;
and.pred %p1, %p9, %p10;
add.s32 %r171, %r15, %r16;
mul.wide.s32 %rd31, %r171, 4;
add.s64 %rd4, %rd26, %rd31;
shr.u32 %r172, %r16, 31;
add.s32 %r173, %r16, %r172;
shr.s32 %r17, %r173, 1;
shl.b32 %r174, %r12, 2;
mad.lo.s32 %r175, %r174, %r2, %r11;
mul.wide.s32 %rd32, %r175, 4;
add.s64 %rd5, %rd27, %rd32;
add.s32 %r176, %r15, 1;
mul.wide.u32 %rd33, %r176, 4;
add.s64 %rd6, %rd26, %rd33;
mul.wide.s32 %rd34, %r165, 4;
add.s64 %rd7, %rd26, %rd34;
or.pred %p2, %p7, %p8;
mul.lo.s32 %r18, %r9, %r161;
mul.lo.s32 %r177, %r163, %r9;
cvt.s64.s32 %rd8, %r177;
cvta.shared.u64 %rd35, %rd26;
add.s64 %rd10, %rd35, %rd25;
mov.u32 %r330, 0;
mov.f32 %f163, 0f00000000;
// begin inline asm
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd10; cvt.u32.u64 %r180, smem_ptr; }
// end inline asm
add.s32 %r181, %r180, %r13;
not.pred %p16, %p1;
mov.f32 %f164, %f163;
mov.f32 %f165, %f163;
mov.f32 %f166, %f163;
$L__BB0_3:
.pragma "nounroll";
@%p2 bra $L__BB0_6;
add.s32 %r178, %r18, %r330;
mad.lo.s32 %r179, %r178, %r4, %r12;
setp.ge.s32 %p11, %r179, %r5;
@%p11 bra $L__BB0_6;
mul.lo.s32 %r183, %r14, %r330;
cvt.s64.s32 %rd38, %r183;
add.s64 %rd39, %rd2, %rd38;
add.s64 %rd40, %rd39, %rd8;
shl.b64 %rd41, %rd40, 2;
add.s64 %rd37, %rd20, %rd41;
mov.u32 %r182, 0;
// begin inline asm
{
.reg .pred p0;
setp.ne.b32 p0, %r182, 0;
cp.async.ca.shared.global [%r181], [%rd37], 16, p0;
}
// end inline asm
$L__BB0_6:
// begin inline asm
cp.async.wait_all;
// end inline asm
@%p2 bra $L__BB0_9;
add.s32 %r184, %r18, %r330;
mad.lo.s32 %r185, %r184, %r4, %r12;
setp.ge.s32 %p12, %r185, %r5;
@%p12 bra $L__BB0_9;
ld.shared.v4.u32 {%r331, %r332, %r333, %r334}, [%rd5];
bra.uni $L__BB0_12;
$L__BB0_9:
mov.u32 %r331, 0;
mov.u32 %r332, %r331;
mov.u32 %r333, %r331;
mov.u32 %r334, %r331;
@%p2 bra $L__BB0_12;
add.s32 %r198, %r18, %r330;
mad.lo.s32 %r199, %r198, %r4, %r12;
setp.ge.s32 %p13, %r199, %r5;
@%p13 bra $L__BB0_12;
ld.shared.v4.u32 {%r331, %r332, %r333, %r334}, [%rd5];
$L__BB0_12:
mov.b32 %f53, %r334;
add.f32 %f166, %f166, %f53;
mov.b32 %f54, %r333;
add.f32 %f165, %f165, %f54;
mov.b32 %f55, %r332;
add.f32 %f164, %f164, %f55;
mov.b32 %f56, %r331;
add.f32 %f163, %f163, %f56;
@%p2 bra $L__BB0_15;
add.s32 %r204, %r18, %r330;
mad.lo.s32 %r205, %r204, %r4, %r12;
setp.ge.s32 %p14, %r205, %r5;
@%p14 bra $L__BB0_15;
ld.shared.v4.f32 {%f57, %f58, %f59, %f60}, [%rd5];
add.f32 %f62, %f57, 0f00000000;
add.f32 %f64, %f62, %f58;
add.f32 %f66, %f64, %f59;
add.f32 %f161, %f66, %f60;
bra.uni $L__BB0_19;
$L__BB0_15:
mov.u32 %r335, 0;
mov.u32 %r336, %r335;
mov.u32 %r337, %r335;
mov.u32 %r338, %r335;
@%p2 bra $L__BB0_18;
add.s32 %r214, %r18, %r330;
mad.lo.s32 %r215, %r214, %r4, %r12;
setp.ge.s32 %p15, %r215, %r5;
@%p15 bra $L__BB0_18;
ld.shared.v4.u32 {%r338, %r337, %r336, %r335}, [%rd5];
$L__BB0_18:
mov.b32 %f68, %r338;
add.f32 %f69, %f68, 0f00000000;
mov.b32 %f70, %r337;
add.f32 %f71, %f69, %f70;
mov.b32 %f72, %r336;
add.f32 %f73, %f71, %f72;
mov.b32 %f74, %r335;
add.f32 %f161, %f73, %f74;
$L__BB0_19:
st.shared.f32 [%rd3], %f161;
bar.sync 0;
@%p16 bra $L__BB0_21;
ld.shared.f32 %f75, [%rd4];
ld.shared.f32 %f76, [%rd3];
add.f32 %f77, %f75, %f76;
st.shared.f32 [%rd3], %f77;
$L__BB0_21:
setp.lt.s32 %p17, %r16, 4;
bar.sync 0;
@%p17 bra $L__BB0_26;
mov.u32 %r339, %r17;
$L__BB0_23:
setp.ge.u32 %p18, %r10, %r339;
@%p18 bra $L__BB0_25;
add.s32 %r220, %r339, %r15;
mul.wide.s32 %rd42, %r220, 4;
add.s64 %rd44, %rd26, %rd42;
ld.shared.f32 %f78, [%rd3];
ld.shared.f32 %f79, [%rd44];
add.f32 %f80, %f79, %f78;
st.shared.f32 [%rd3], %f80;
$L__BB0_25:
bar.sync 0;
shr.u32 %r41, %r339, 1;
setp.gt.u32 %p19, %r339, 3;
mov.u32 %r339, %r41;
@%p19 bra $L__BB0_23;
$L__BB0_26:
setp.ne.s32 %p20, %r10, 0;
mov.f32 %f162, 0f00000000;
@%p20 bra $L__BB0_29;
setp.lt.u32 %p21, %r3, 2;
ld.shared.f32 %f82, [%rd3];
add.f32 %f162, %f82, 0f00000000;
@%p21 bra $L__BB0_29;
ld.shared.f32 %f83, [%rd6];
add.f32 %f162, %f162, %f83;
$L__BB0_29:
bar.sync 0;
@%p20 bra $L__BB0_31;
st.shared.f32 [%rd7], %f162;
$L__BB0_31:
bar.sync 0;
ld.shared.f32 %f15, [%rd7];
bar.sync 0;
@%p2 bra $L__BB0_34;
add.s32 %r221, %r18, %r330;
mad.lo.s32 %r42, %r221, %r4, %r12;
setp.ge.s32 %p23, %r42, %r5;
@%p23 bra $L__BB0_34;
ld.shared.v4.f32 {%f84, %f85, %f86, %f87}, [%rd5];
add.f32 %f89, %f15, %f84;
mov.b32 %r222, %f89;
add.f32 %f91, %f15, %f85;
mov.b32 %r223, %f91;
add.f32 %f93, %f15, %f86;
mov.b32 %r224, %f93;
add.f32 %f95, %f15, %f87;
mov.b32 %r225, %f95;
mad.lo.s32 %r226, %r42, %r128, %r11;
mul.wide.s32 %rd46, %r226, 4;
add.s64 %rd45, %rd21, %rd46;
// begin inline asm
st.global.cs.v4.s32 [%rd45], {%r222,%r223,%r224,%r225};
// end inline asm
bra.uni $L__BB0_40;
$L__BB0_34:
mov.u32 %r340, 0;
mov.u32 %r341, %r340;
mov.u32 %r342, %r340;
mov.u32 %r343, %r340;
@%p2 bra $L__BB0_37;
add.s32 %r235, %r18, %r330;
mad.lo.s32 %r236, %r235, %r4, %r12;
setp.ge.s32 %p24, %r236, %r5;
@%p24 bra $L__BB0_37;
ld.shared.v4.u32 {%r343, %r342, %r341, %r340}, [%rd5];
$L__BB0_37:
mov.b32 %f96, %r343;
add.f32 %f97, %f15, %f96;
mov.b32 %r51, %f97;
mov.b32 %f98, %r342;
add.f32 %f99, %f15, %f98;
mov.b32 %r52, %f99;
mov.b32 %f100, %r341;
add.f32 %f101, %f15, %f100;
mov.b32 %r53, %f101;
mov.b32 %f102, %r340;
add.f32 %f16, %f15, %f102;
@%p2 bra $L__BB0_40;
add.s32 %r241, %r18, %r330;
mad.lo.s32 %r54, %r241, %r4, %r12;
setp.ge.s32 %p25, %r54, %r5;
@%p25 bra $L__BB0_40;
mad.lo.s32 %r246, %r54, %r128, %r11;
mul.wide.s32 %rd48, %r246, 4;
add.s64 %rd47, %rd21, %rd48;
mov.b32 %r245, %f16;
// begin inline asm
st.global.cs.v4.s32 [%rd47], {%r51,%r52,%r53,%r245};
// end inline asm
$L__BB0_40:
add.s32 %r330, %r330, 1;
setp.lt.s32 %p26, %r330, %r9;
@%p26 bra $L__BB0_3;
bra.uni $L__BB0_41;
$L__BB0_1:
mov.f32 %f163, 0f00000000;
mov.f32 %f164, %f163;
mov.f32 %f165, %f163;
mov.f32 %f166, %f163;
$L__BB0_41:
mov.u32 %r56, %tid.x;
mov.u32 %r247, %tid.z;
mov.u32 %r59, %tid.y;
mad.lo.s32 %r248, %r4, %r247, %r59;
mad.lo.s32 %r57, %r248, %r3, %r56;
mul.wide.u32 %rd49, %r57, 4;
mov.u64 %rd50, _ZN57_GLOBAL__N__00000000_19___tmp_nvfuser_66_cu_37f2fec9_72335arrayE;
add.s64 %rd12, %rd50, %rd49;
clz.b32 %r249, %r4;
mov.u32 %r250, 31;
sub.s32 %r251, %r250, %r249;
mov.u32 %r252, 1;
shl.b32 %r58, %r252, %r251;
setp.lt.u32 %p27, %r59, %r58;
add.s32 %r253, %r58, %r59;
setp.lt.u32 %p28, %r253, %r4;
and.pred %p3, %p27, %p28;
shl.b32 %r254, %r3, %r251;
add.s32 %r255, %r57, %r254;
mul.wide.s32 %rd51, %r255, 4;
add.s64 %rd13, %rd50, %rd51;
shr.u32 %r256, %r58, 31;
add.s32 %r257, %r58, %r256;
shr.s32 %r350, %r257, 1;
st.shared.f32 [%rd12], %f163;
bar.sync 0;
not.pred %p29, %p3;
@%p29 bra $L__BB0_43;
ld.shared.f32 %f103, [%rd13];
ld.shared.f32 %f104, [%rd12];
add.f32 %f105, %f103, %f104;
st.shared.f32 [%rd12], %f105;
$L__BB0_43:
setp.lt.s32 %p30, %r58, 4;
bar.sync 0;
@%p30 bra $L__BB0_48;
mov.u32 %r344, %r350;
$L__BB0_45:
setp.ge.u32 %p31, %r59, %r344;
@%p31 bra $L__BB0_47;
mad.lo.s32 %r258, %r344, %r3, %r57;
mul.wide.s32 %rd52, %r258, 4;
add.s64 %rd54, %rd50, %rd52;
ld.shared.f32 %f106, [%rd12];
ld.shared.f32 %f107, [%rd54];
add.f32 %f108, %f107, %f106;
st.shared.f32 [%rd12], %f108;
$L__BB0_47:
bar.sync 0;
shr.u32 %r62, %r344, 1;
setp.gt.u32 %p32, %r344, 3;
mov.u32 %r344, %r62;
@%p32 bra $L__BB0_45;
$L__BB0_48:
add.s32 %r260, %r57, %r3;
mul.wide.u32 %rd55, %r260, 4;
add.s64 %rd14, %rd50, %rd55;
setp.ne.s32 %p33, %r59, 0;
mov.u32 %r345, 0;
@%p33 bra $L__BB0_52;
setp.lt.u32 %p34, %r4, 2;
ld.shared.f32 %f109, [%rd12];
add.f32 %f167, %f109, 0f00000000;
@%p34 bra $L__BB0_51;
ld.shared.f32 %f110, [%rd14];
add.f32 %f167, %f167, %f110;
$L__BB0_51:
mov.b32 %r345, %f167;
$L__BB0_52:
bar.sync 0;
st.shared.f32 [%rd12], %f164;
bar.sync 0;
@%p29 bra $L__BB0_54;
ld.shared.f32 %f111, [%rd13];
ld.shared.f32 %f112, [%rd12];
add.f32 %f113, %f111, %f112;
st.shared.f32 [%rd12], %f113;
$L__BB0_54:
bar.sync 0;
@%p30 bra $L__BB0_59;
mov.u32 %r346, %r350;
$L__BB0_56:
setp.ge.u32 %p37, %r59, %r346;
@%p37 bra $L__BB0_58;
mad.lo.s32 %r261, %r346, %r3, %r57;
mul.wide.s32 %rd57, %r261, 4;
add.s64 %rd59, %rd50, %rd57;
ld.shared.f32 %f114, [%rd12];
ld.shared.f32 %f115, [%rd59];
add.f32 %f116, %f115, %f114;
st.shared.f32 [%rd12], %f116;
$L__BB0_58:
bar.sync 0;
shr.u32 %r66, %r346, 1;
setp.gt.u32 %p38, %r346, 3;
mov.u32 %r346, %r66;
@%p38 bra $L__BB0_56;
$L__BB0_59:
mov.u32 %r347, 0;
@%p33 bra $L__BB0_63;
setp.lt.u32 %p40, %r4, 2;
ld.shared.f32 %f117, [%rd12];
add.f32 %f168, %f117, 0f00000000;
@%p40 bra $L__BB0_62;
ld.shared.f32 %f118, [%rd14];
add.f32 %f168, %f168, %f118;
$L__BB0_62:
mov.b32 %r347, %f168;
$L__BB0_63:
bar.sync 0;
st.shared.f32 [%rd12], %f165;
bar.sync 0;
@%p29 bra $L__BB0_65;
ld.shared.f32 %f119, [%rd13];
ld.shared.f32 %f120, [%rd12];
add.f32 %f121, %f119, %f120;
st.shared.f32 [%rd12], %f121;
$L__BB0_65:
bar.sync 0;
@%p30 bra $L__BB0_70;
mov.u32 %r348, %r350;
$L__BB0_67:
setp.ge.u32 %p43, %r59, %r348;
@%p43 bra $L__BB0_69;
mad.lo.s32 %r263, %r348, %r3, %r57;
mul.wide.s32 %rd60, %r263, 4;
add.s64 %rd62, %rd50, %rd60;
ld.shared.f32 %f122, [%rd12];
ld.shared.f32 %f123, [%rd62];
add.f32 %f124, %f123, %f122;
st.shared.f32 [%rd12], %f124;
$L__BB0_69:
bar.sync 0;
shr.u32 %r70, %r348, 1;
setp.gt.u32 %p44, %r348, 3;
mov.u32 %r348, %r70;
@%p44 bra $L__BB0_67;
$L__BB0_70:
mov.u32 %r349, 0;
@%p33 bra $L__BB0_74;
setp.lt.u32 %p46, %r4, 2;
ld.shared.f32 %f125, [%rd12];
add.f32 %f169, %f125, 0f00000000;
@%p46 bra $L__BB0_73;
ld.shared.f32 %f126, [%rd14];
add.f32 %f169, %f169, %f126;
$L__BB0_73:
mov.b32 %r349, %f169;
$L__BB0_74:
bar.sync 0;
st.shared.f32 [%rd12], %f166;
bar.sync 0;
@%p29 bra $L__BB0_76;
ld.shared.f32 %f127, [%rd13];
ld.shared.f32 %f128, [%rd12];
add.f32 %f129, %f127, %f128;
st.shared.f32 [%rd12], %f129;
$L__BB0_76:
bar.sync 0;
@%p30 bra $L__BB0_80;
$L__BB0_77:
setp.ge.u32 %p49, %r59, %r350;
@%p49 bra $L__BB0_79;
mad.lo.s32 %r265, %r350, %r3, %r57;
mul.wide.s32 %rd63, %r265, 4;
add.s64 %rd65, %rd50, %rd63;
ld.shared.f32 %f130, [%rd12];
ld.shared.f32 %f131, [%rd65];
add.f32 %f132, %f131, %f130;
st.shared.f32 [%rd12], %f132;
$L__BB0_79:
bar.sync 0;
shr.u32 %r74, %r350, 1;
setp.gt.u32 %p50, %r350, 3;
mov.u32 %r350, %r74;
@%p50 bra $L__BB0_77;
$L__BB0_80:
mov.u32 %r351, 0;
@%p33 bra $L__BB0_84;
setp.lt.u32 %p52, %r4, 2;
ld.shared.f32 %f133, [%rd12];
add.f32 %f170, %f133, 0f00000000;
@%p52 bra $L__BB0_83;
ld.shared.f32 %f134, [%rd14];
add.f32 %f170, %f170, %f134;
$L__BB0_83:
mov.b32 %r351, %f170;
$L__BB0_84:
bar.sync 0;
setp.ge.s32 %p53, %r56, %r2;
@%p53 bra $L__BB0_87;
shl.b32 %r77, %r56, 2;
or.b32 %r267, %r77, 3;
setp.ge.s32 %p55, %r267, %r128;
or.pred %p56, %p33, %p55;
@%p56 bra $L__BB0_87;
mov.u32 %r272, %ctaid.y;
mad.lo.s32 %r273, %r128, %r272, %r77;
mul.wide.s32 %rd67, %r273, 4;
add.s64 %rd66, %rd23, %rd67;
// begin inline asm
st.volatile.global.v4.s32 [%rd66], {%r345,%r347,%r349,%r351};
// end inline asm
$L__BB0_87:
mov.u32 %r78, %ctaid.y;
membar.gl;
bar.sync 0;
or.b32 %r275, %r247, %r56;
or.b32 %r276, %r275, %r59;
setp.ne.s32 %p57, %r276, 0;
@%p57 bra $L__BB0_91;
cvta.to.global.u64 %rd68, %rd24;
mov.u32 %r277, %ctaid.z;
mov.u32 %r278, %nctaid.x;
mov.u32 %r279, %ctaid.x;
mad.lo.s32 %r280, %r277, %r278, %r279;
mul.wide.s32 %rd69, %r280, 8;
add.s64 %rd16, %rd68, %rd69;
setp.eq.s32 %p58, %r78, %r8;
cvt.s64.s32 %rd70, %r7;
mov.u64 %rd71, -9223372036854775807;
sub.s64 %rd72, %rd71, %rd70;
selp.b64 %rd73, %rd72, 1, %p58;
atom.global.add.u64 %rd17, [%rd16], %rd73;
ld.volatile.global.u64 %rd74, [%rd16];
xor.b64 %rd75, %rd74, %rd17;
setp.lt.s64 %p59, %rd75, 0;
@%p59 bra $L__BB0_91;
mov.u32 %r352, 8;
$L__BB0_90:
// begin inline asm
nanosleep.u32 %r352;
// end inline asm
setp.lt.u32 %p60, %r352, 256;
selp.u32 %r283, 1, 0, %p60;
shl.b32 %r352, %r352, %r283;
ld.volatile.global.u64 %rd76, [%rd16];
xor.b64 %rd77, %rd76, %rd17;
setp.gt.s64 %p61, %rd77, -1;
@%p61 bra $L__BB0_90;
$L__BB0_91:
bar.sync 0;
add.s32 %r284, %r8, %r3;
div.s32 %r81, %r284, %r3;
setp.lt.s32 %p62, %r81, 1;
mov.f32 %f173, 0f00000000;
mov.f32 %f174, %f173;
@%p62 bra $L__BB0_97;
add.s32 %r286, %r128, 1;
shr.u32 %r287, %r286, 31;
add.s32 %r288, %r286, %r287;
shr.s32 %r289, %r288, 1;
add.s32 %r290, %r6, %r289;
shl.b32 %r291, %r4, 1;
shl.b32 %r292, %r59, 1;
mad.lo.s32 %r293, %r291, %r78, %r292;
or.b32 %r294, %r293, 1;
setp.ge.s32 %p63, %r294, %r128;
div.s32 %r295, %r290, %r4;
setp.ge.s32 %p64, %r78, %r295;
or.pred %p4, %p64, %p63;
mul.lo.s32 %r296, %r4, %r78;
shl.b32 %r297, %r296, 1;
mad.lo.s32 %r298, %r128, %r56, %r297;
add.s32 %r354, %r298, %r292;
mul.lo.s32 %r83, %r128, %r3;
mov.u32 %r285, 0;
mov.f32 %f173, 0f00000000;
mov.u32 %r353, %r56;
mov.u32 %r355, %r285;
$L__BB0_93:
.pragma "nounroll";
mov.u32 %r356, %r285;
mov.u32 %r357, %r285;
@%p4 bra $L__BB0_96;
setp.ge.s32 %p65, %r353, %r7;
mov.u32 %r356, %r285;
mov.u32 %r357, %r285;
@%p65 bra $L__BB0_96;
mul.wide.s32 %rd79, %r354, 4;
add.s64 %rd78, %rd23, %rd79;
// begin inline asm
ld.volatile.global.v2.s32 {%r357,%r356}, [%rd78];
// end inline asm
$L__BB0_96:
mov.b32 %f139, %r357;
add.f32 %f173, %f173, %f139;
mov.b32 %f140, %r356;
add.f32 %f174, %f174, %f140;
add.s32 %r354, %r354, %r83;
add.s32 %r353, %r353, %r3;
add.s32 %r355, %r355, 1;
setp.lt.s32 %p66, %r355, %r81;
@%p66 bra $L__BB0_93;
$L__BB0_97:
clz.b32 %r305, %r3;
mov.u32 %r306, 31;
sub.s32 %r307, %r306, %r305;
mov.u32 %r308, 1;
shl.b32 %r94, %r308, %r307;
setp.lt.u32 %p67, %r56, %r94;
add.s32 %r309, %r94, %r56;
setp.lt.u32 %p68, %r309, %r3;
and.pred %p5, %p67, %p68;
add.s32 %r310, %r57, %r94;
mul.wide.s32 %rd80, %r310, 4;
add.s64 %rd18, %rd50, %rd80;
shr.u32 %r311, %r94, 31;
add.s32 %r312, %r94, %r311;
shr.s32 %r360, %r312, 1;
st.shared.f32 [%rd12], %f173;
bar.sync 0;
not.pred %p69, %p5;
@%p69 bra $L__BB0_99;
ld.shared.f32 %f141, [%rd18];
ld.shared.f32 %f142, [%rd12];
add.f32 %f143, %f141, %f142;
st.shared.f32 [%rd12], %f143;
$L__BB0_99:
setp.lt.s32 %p70, %r94, 4;
bar.sync 0;
@%p70 bra $L__BB0_104;
mov.u32 %r358, %r360;
$L__BB0_101:
setp.ge.u32 %p71, %r56, %r358;
@%p71 bra $L__BB0_103;
add.s32 %r313, %r358, %r57;
mul.wide.s32 %rd82, %r313, 4;
add.s64 %rd84, %rd50, %rd82;
ld.shared.f32 %f144, [%rd12];
ld.shared.f32 %f145, [%rd84];
add.f32 %f146, %f145, %f144;
st.shared.f32 [%rd12], %f146;
$L__BB0_103:
bar.sync 0;
shr.u32 %r97, %r358, 1;
setp.gt.u32 %p72, %r358, 3;
mov.u32 %r358, %r97;
@%p72 bra $L__BB0_101;
$L__BB0_104:
add.s32 %r315, %r57, 1;
mul.wide.u32 %rd85, %r315, 4;
add.s64 %rd19, %rd50, %rd85;
setp.ne.s32 %p73, %r56, 0;
mov.u32 %r359, 0;
@%p73 bra $L__BB0_108;
setp.lt.u32 %p74, %r3, 2;
ld.shared.f32 %f147, [%rd12];
add.f32 %f175, %f147, 0f00000000;
@%p74 bra $L__BB0_107;
ld.shared.f32 %f148, [%rd19];
add.f32 %f175, %f175, %f148;
$L__BB0_107:
mov.b32 %r359, %f175;
$L__BB0_108:
bar.sync 0;
st.shared.f32 [%rd12], %f174;
bar.sync 0;
@%p69 bra $L__BB0_110;
ld.shared.f32 %f149, [%rd18];
ld.shared.f32 %f150, [%rd12];
add.f32 %f151, %f149, %f150;
st.shared.f32 [%rd12], %f151;
$L__BB0_110:
bar.sync 0;
@%p70 bra $L__BB0_114;
$L__BB0_111:
setp.ge.u32 %p77, %r56, %r360;
@%p77 bra $L__BB0_113;
add.s32 %r316, %r360, %r57;
mul.wide.s32 %rd87, %r316, 4;
add.s64 %rd89, %rd50, %rd87;
ld.shared.f32 %f152, [%rd12];
ld.shared.f32 %f153, [%rd89];
add.f32 %f154, %f153, %f152;
st.shared.f32 [%rd12], %f154;
$L__BB0_113:
bar.sync 0;
shr.u32 %r101, %r360, 1;
setp.gt.u32 %p78, %r360, 3;
mov.u32 %r360, %r101;
@%p78 bra $L__BB0_111;
$L__BB0_114:
mov.u32 %r361, 0;
@%p73 bra $L__BB0_118;
setp.lt.u32 %p80, %r3, 2;
ld.shared.f32 %f155, [%rd12];
add.f32 %f176, %f155, 0f00000000;
@%p80 bra $L__BB0_117;
ld.shared.f32 %f156, [%rd19];
add.f32 %f176, %f176, %f156;
$L__BB0_117:
mov.b32 %r361, %f176;
$L__BB0_118:
bar.sync 0;
@%p73 bra $L__BB0_122;
add.s32 %r318, %r128, 1;
shr.u32 %r319, %r318, 31;
add.s32 %r320, %r318, %r319;
shr.s32 %r321, %r320, 1;
add.s32 %r322, %r6, %r321;
div.s32 %r323, %r322, %r4;
setp.ge.s32 %p82, %r78, %r323;
@%p82 bra $L__BB0_122;
shl.b32 %r104, %r59, 1;
mul.lo.s32 %r324, %r4, %r78;
shl.b32 %r105, %r324, 1;
add.s32 %r325, %r105, %r104;
or.b32 %r326, %r325, 1;
setp.ge.s32 %p83, %r326, %r128;
@%p83 bra $L__BB0_122;
add.s32 %r329, %r104, %r105;
mul.wide.s32 %rd91, %r329, 4;
add.s64 %rd90, %rd22, %rd91;
// begin inline asm
st.global.cs.v2.s32 [%rd90], {%r359,%r361};
// end inline asm
$L__BB0_122:
ret;
}
--- 0ddccc60e
+++ cfa1a2c6b
@@ -66,63 +66,63 @@
mov.u32 %r10, %tid.x;
setp.ge.s32 %p7, %r10, %r2;
shl.b32 %r11, %r10, 2;
or.b32 %r158, %r11, 3;
setp.ge.s32 %p8, %r158, %r128;
- shl.b32 %r159, %r10, 4;
mov.u32 %r12, %tid.y;
- shl.b32 %r160, %r12, 2;
- mad.lo.s32 %r13, %r160, %r128, %r159;
+ mad.lo.s32 %r159, %r2, %r12, %r10;
+ shl.b32 %r13, %r159, 4;
cvt.s64.s32 %rd28, %r11;
- mul.lo.s32 %r161, %r128, %r12;
- cvt.s64.s32 %rd29, %r161;
+ mul.lo.s32 %r160, %r128, %r12;
+ cvt.s64.s32 %rd29, %r160;
add.s64 %rd2, %rd29, %rd28;
- mov.u32 %r162, %ctaid.y;
- mul.lo.s32 %r163, %r162, %r4;
- mul.lo.s32 %r164, %r163, %r128;
+ mov.u32 %r161, %ctaid.y;
+ mul.lo.s32 %r162, %r161, %r4;
+ mul.lo.s32 %r163, %r162, %r128;
mul.lo.s32 %r14, %r128, %r4;
- mov.u32 %r165, %tid.z;
- mad.lo.s32 %r166, %r4, %r165, %r12;
- mad.lo.s32 %r15, %r166, %r3, %r10;
+ mov.u32 %r164, %tid.z;
+ mad.lo.s32 %r165, %r4, %r164, %r12;
+ mad.lo.s32 %r15, %r165, %r3, %r10;
mul.wide.u32 %rd30, %r15, 4;
add.s64 %rd3, %rd26, %rd30;
- clz.b32 %r167, %r3;
- mov.u32 %r168, 31;
- sub.s32 %r169, %r168, %r167;
- mov.u32 %r170, 1;
- shl.b32 %r16, %r170, %r169;
+ clz.b32 %r166, %r3;
+ mov.u32 %r167, 31;
+ sub.s32 %r168, %r167, %r166;
+ mov.u32 %r169, 1;
+ shl.b32 %r16, %r169, %r168;
setp.lt.u32 %p9, %r10, %r16;
- add.s32 %r171, %r16, %r10;
- setp.lt.u32 %p10, %r171, %r3;
+ add.s32 %r170, %r16, %r10;
+ setp.lt.u32 %p10, %r170, %r3;
and.pred %p1, %p9, %p10;
- add.s32 %r172, %r15, %r16;
- mul.wide.s32 %rd31, %r172, 4;
+ add.s32 %r171, %r15, %r16;
+ mul.wide.s32 %rd31, %r171, 4;
add.s64 %rd4, %rd26, %rd31;
- shr.u32 %r173, %r16, 31;
- add.s32 %r174, %r16, %r173;
- shr.s32 %r17, %r174, 1;
- add.s32 %r175, %r161, %r11;
+ shr.u32 %r172, %r16, 31;
+ add.s32 %r173, %r16, %r172;
+ shr.s32 %r17, %r173, 1;
+ shl.b32 %r174, %r12, 2;
+ mad.lo.s32 %r175, %r174, %r2, %r11;
mul.wide.s32 %rd32, %r175, 4;
add.s64 %rd5, %rd27, %rd32;
add.s32 %r176, %r15, 1;
mul.wide.u32 %rd33, %r176, 4;
add.s64 %rd6, %rd26, %rd33;
- mul.wide.s32 %rd34, %r166, 4;
+ mul.wide.s32 %rd34, %r165, 4;
add.s64 %rd7, %rd26, %rd34;
or.pred %p2, %p7, %p8;
- mul.lo.s32 %r18, %r9, %r162;
- mul.lo.s32 %r177, %r164, %r9;
+ mul.lo.s32 %r18, %r9, %r161;
+ mul.lo.s32 %r177, %r163, %r9;
cvt.s64.s32 %rd8, %r177;
cvta.shared.u64 %rd35, %rd26;
add.s64 %rd10, %rd35, %rd25;
mov.u32 %r330, 0;
mov.f32 %f163, 0f00000000;
{ .reg .u64 smem_ptr; cvta.to.shared.u64 smem_ptr, %rd10; cvt.u32.u64 %r180, smem_ptr; }
- add.s32 %r181, %r13, %r180;
+ add.s32 %r181, %r180, %r13;
not.pred %p16, %p1;
mov.f32 %f164, %f163;
mov.f32 %f165, %f163;
mov.f32 %f166, %f163;